+#else /* !__DARWIN_UNIX03 */
+
+typedef struct {
+ uint64_t high;
+ uint64_t low;
+} uint128_t;
+
+/* 128-bit addition: acc += add */
+static inline void
+add128_128(uint128_t *acc, uint128_t *add)
+{
+ acc->high += add->high;
+ acc->low += add->low;
+ if(acc->low < add->low)
+ acc->high++; // carry
+}
+
+/* 128-bit subtraction: acc -= sub */
+static inline void
+sub128_128(uint128_t *acc, uint128_t *sub)
+{
+ acc->high -= sub->high;
+ if(acc->low < sub->low)
+ acc->high--; // borrow
+ acc->low -= sub->low;
+}
+
+#define TWO64 (((double)(1ULL << 32)) * ((double)(1ULL << 32)))
+
+static inline double
+uint128_double(uint128_t *u)
+{
+ return TWO64 * u->high + u->low; // may loses precision
+}
+
+/* 64x64 -> 128 bit multiplication */
+static inline void
+mul64x64(uint64_t x, uint64_t y, uint128_t *prod)
+{
+ uint128_t add;
+ /*
+ * Split the two 64-bit multiplicands into 32-bit parts:
+ * x => 2^32 * x1 + x2
+ * y => 2^32 * y1 + y2
+ */
+ uint32_t x1 = (uint32_t)(x >> 32);
+ uint32_t x2 = (uint32_t)x;
+ uint32_t y1 = (uint32_t)(y >> 32);
+ uint32_t y2 = (uint32_t)y;
+ /*
+ * direct multiplication:
+ * x * y => 2^64 * (x1 * y1) + 2^32 (x1 * y2 + x2 * y1) + (x2 * y2)
+ * The first and last terms are direct assignmenet into the uint128_t
+ * structure. Then we add the middle two terms separately, to avoid
+ * 64-bit overflow. (We could use the Karatsuba algorithm to save
+ * one multiply, but it is harder to deal with 64-bit overflows.)
+ */
+ prod->high = (uint64_t)x1 * (uint64_t)y1;
+ prod->low = (uint64_t)x2 * (uint64_t)y2;
+ add.low = (uint64_t)x1 * (uint64_t)y2;
+ add.high = (add.low >> 32);
+ add.low <<= 32;
+ add128_128(prod, &add);
+ add.low = (uint64_t)x2 * (uint64_t)y1;
+ add.high = (add.low >> 32);
+ add.low <<= 32;
+ add128_128(prod, &add);
+}
+
+/* calculate (x * y / divisor), using 128-bit internal calculations */
+static int
+muldiv128(uint64_t x, uint64_t y, uint64_t divisor, uint64_t *res)
+{
+ uint128_t temp;
+ uint128_t divisor128 = {0, divisor};
+ uint64_t result = 0;
+ double recip;
+
+ /* calculate (x * y) */
+ mul64x64(x, y, &temp);
+ /*
+ * Now divide by the divisor. We use floating point to calculate an
+ * approximate answer and update the results. Then we iterate and
+ * calculate a correction from the difference.
+ */
+ recip = 1.0 / ((double)divisor);
+ while(temp.high || temp.low >= divisor) {
+ uint128_t backmul;
+ uint64_t uapprox;
+ double approx = uint128_double(&temp) * recip;
+
+ if(approx > __LONG_LONG_MAX__)
+ return 0; // answer overflows 64-bits
+ uapprox = (uint64_t)approx;
+ mul64x64(uapprox, divisor, &backmul);
+ /*
+ * Because we are using unsigned integers, we need to approach the
+ * answer from the lesser side. So if our estimate is too large
+ * we need to decrease it until it is smaller.
+ */
+ while(backmul.high > temp.high || backmul.high == temp.high && backmul.low > temp.low) {
+ sub128_128(&backmul, &divisor128);
+ uapprox--;
+ }
+ sub128_128(&temp, &backmul);
+ result += uapprox;
+ }
+ *res = result;
+ return 1;
+}