]> git.saurik.com Git - apple/libc.git/blame - db/btree/bt_split.c
Libc-320.1.3.tar.gz
[apple/libc.git] / db / btree / bt_split.c
CommitLineData
e9ce8d39 1/*
9385eb3d 2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
e9ce8d39
A
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
734aad71
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
e9ce8d39
A
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
734aad71
A
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
e9ce8d39
A
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
9385eb3d
A
23/*-
24 * Copyright (c) 1990, 1993, 1994
e9ce8d39
A
25 * The Regents of the University of California. All rights reserved.
26 *
27 * This code is derived from software contributed to Berkeley by
28 * Mike Olson.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 * 1. Redistributions of source code must retain the above copyright
34 * notice, this list of conditions and the following disclaimer.
35 * 2. Redistributions in binary form must reproduce the above copyright
36 * notice, this list of conditions and the following disclaimer in the
37 * documentation and/or other materials provided with the distribution.
38 * 3. All advertising materials mentioning features or use of this software
39 * must display the following acknowledgement:
40 * This product includes software developed by the University of
41 * California, Berkeley and its contributors.
42 * 4. Neither the name of the University nor the names of its contributors
43 * may be used to endorse or promote products derived from this software
44 * without specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 */
58
9385eb3d
A
59#if defined(LIBC_SCCS) && !defined(lint)
60static char sccsid[] = "@(#)bt_split.c 8.9 (Berkeley) 7/26/94";
61#endif /* LIBC_SCCS and not lint */
62#include <sys/cdefs.h>
e9ce8d39
A
63
64#include <sys/types.h>
65
66#include <limits.h>
67#include <stdio.h>
68#include <stdlib.h>
69#include <string.h>
70
71#include <db.h>
72#include "btree.h"
73
9385eb3d
A
74static int bt_broot(BTREE *, PAGE *, PAGE *, PAGE *);
75static PAGE *bt_page (BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
76static int bt_preserve(BTREE *, pgno_t);
77static PAGE *bt_psplit (BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t);
78static PAGE *bt_root (BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
79static int bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *);
80static recno_t rec_total(PAGE *);
e9ce8d39
A
81
82#ifdef STATISTICS
83u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
84#endif
85
86/*
87 * __BT_SPLIT -- Split the tree.
88 *
89 * Parameters:
90 * t: tree
91 * sp: page to split
92 * key: key to insert
93 * data: data to insert
94 * flags: BIGKEY/BIGDATA flags
95 * ilen: insert length
96 * skip: index to leave open
97 *
98 * Returns:
99 * RET_ERROR, RET_SUCCESS
100 */
101int
9385eb3d 102__bt_split(t, sp, key, data, flags, ilen, argskip)
e9ce8d39
A
103 BTREE *t;
104 PAGE *sp;
105 const DBT *key, *data;
106 int flags;
107 size_t ilen;
9385eb3d 108 u_int32_t argskip;
e9ce8d39
A
109{
110 BINTERNAL *bi;
111 BLEAF *bl, *tbl;
112 DBT a, b;
113 EPGNO *parent;
114 PAGE *h, *l, *r, *lchild, *rchild;
115 indx_t nxtindex;
9385eb3d
A
116 u_int16_t skip;
117 u_int32_t n, nbytes, nksize;
e9ce8d39
A
118 int parentsplit;
119 char *dest;
120
121 /*
122 * Split the page into two pages, l and r. The split routines return
123 * a pointer to the page into which the key should be inserted and with
124 * skip set to the offset which should be used. Additionally, l and r
125 * are pinned.
126 */
9385eb3d 127 skip = argskip;
e9ce8d39
A
128 h = sp->pgno == P_ROOT ?
129 bt_root(t, sp, &l, &r, &skip, ilen) :
130 bt_page(t, sp, &l, &r, &skip, ilen);
131 if (h == NULL)
132 return (RET_ERROR);
133
134 /*
135 * Insert the new key/data pair into the leaf page. (Key inserts
136 * always cause a leaf page to split first.)
137 */
138 h->linp[skip] = h->upper -= ilen;
139 dest = (char *)h + h->upper;
9385eb3d 140 if (F_ISSET(t, R_RECNO))
e9ce8d39
A
141 WR_RLEAF(dest, data, flags)
142 else
143 WR_BLEAF(dest, key, data, flags)
144
145 /* If the root page was split, make it look right. */
146 if (sp->pgno == P_ROOT &&
9385eb3d 147 (F_ISSET(t, R_RECNO) ?
e9ce8d39
A
148 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
149 goto err2;
150
151 /*
152 * Now we walk the parent page stack -- a LIFO stack of the pages that
153 * were traversed when we searched for the page that split. Each stack
154 * entry is a page number and a page index offset. The offset is for
155 * the page traversed on the search. We've just split a page, so we
156 * have to insert a new key into the parent page.
157 *
158 * If the insert into the parent page causes it to split, may have to
159 * continue splitting all the way up the tree. We stop if the root
160 * splits or the page inserted into didn't have to split to hold the
161 * new key. Some algorithms replace the key for the old page as well
162 * as the new page. We don't, as there's no reason to believe that the
163 * first key on the old page is any better than the key we have, and,
164 * in the case of a key being placed at index 0 causing the split, the
165 * key is unavailable.
166 *
167 * There are a maximum of 5 pages pinned at any time. We keep the left
168 * and right pages pinned while working on the parent. The 5 are the
169 * two children, left parent and right parent (when the parent splits)
170 * and the root page or the overflow key page when calling bt_preserve.
171 * This code must make sure that all pins are released other than the
172 * root page or overflow page which is unlocked elsewhere.
173 */
174 while ((parent = BT_POP(t)) != NULL) {
175 lchild = l;
176 rchild = r;
177
178 /* Get the parent page. */
179 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
180 goto err2;
181
182 /*
183 * The new key goes ONE AFTER the index, because the split
184 * was to the right.
185 */
186 skip = parent->index + 1;
187
188 /*
189 * Calculate the space needed on the parent page.
190 *
191 * Prefix trees: space hack when inserting into BINTERNAL
192 * pages. Retain only what's needed to distinguish between
193 * the new entry and the LAST entry on the page to its left.
194 * If the keys compare equal, retain the entire key. Note,
195 * we don't touch overflow keys, and the entire key must be
196 * retained for the next-to-left most key on the leftmost
197 * page of each level, or the search will fail. Applicable
198 * ONLY to internal pages that have leaf pages as children.
199 * Further reduction of the key between pairs of internal
200 * pages loses too much information.
201 */
202 switch (rchild->flags & P_TYPE) {
203 case P_BINTERNAL:
204 bi = GETBINTERNAL(rchild, 0);
205 nbytes = NBINTERNAL(bi->ksize);
206 break;
207 case P_BLEAF:
208 bl = GETBLEAF(rchild, 0);
209 nbytes = NBINTERNAL(bl->ksize);
210 if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
211 (h->prevpg != P_INVALID || skip > 1)) {
212 tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
213 a.size = tbl->ksize;
214 a.data = tbl->bytes;
215 b.size = bl->ksize;
216 b.data = bl->bytes;
217 nksize = t->bt_pfx(&a, &b);
218 n = NBINTERNAL(nksize);
219 if (n < nbytes) {
220#ifdef STATISTICS
221 bt_pfxsaved += nbytes - n;
222#endif
223 nbytes = n;
224 } else
225 nksize = 0;
226 } else
227 nksize = 0;
228 break;
229 case P_RINTERNAL:
230 case P_RLEAF:
231 nbytes = NRINTERNAL;
232 break;
233 default:
234 abort();
235 }
236
237 /* Split the parent page if necessary or shift the indices. */
238 if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
239 sp = h;
240 h = h->pgno == P_ROOT ?
241 bt_root(t, h, &l, &r, &skip, nbytes) :
242 bt_page(t, h, &l, &r, &skip, nbytes);
243 if (h == NULL)
244 goto err1;
245 parentsplit = 1;
246 } else {
247 if (skip < (nxtindex = NEXTINDEX(h)))
248 memmove(h->linp + skip + 1, h->linp + skip,
249 (nxtindex - skip) * sizeof(indx_t));
250 h->lower += sizeof(indx_t);
251 parentsplit = 0;
252 }
253
254 /* Insert the key into the parent page. */
9385eb3d 255 switch (rchild->flags & P_TYPE) {
e9ce8d39
A
256 case P_BINTERNAL:
257 h->linp[skip] = h->upper -= nbytes;
258 dest = (char *)h + h->linp[skip];
259 memmove(dest, bi, nbytes);
260 ((BINTERNAL *)dest)->pgno = rchild->pgno;
261 break;
262 case P_BLEAF:
263 h->linp[skip] = h->upper -= nbytes;
264 dest = (char *)h + h->linp[skip];
265 WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
266 rchild->pgno, bl->flags & P_BIGKEY);
267 memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
268 if (bl->flags & P_BIGKEY &&
269 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
270 goto err1;
271 break;
272 case P_RINTERNAL:
273 /*
274 * Update the left page count. If split
275 * added at index 0, fix the correct page.
276 */
277 if (skip > 0)
278 dest = (char *)h + h->linp[skip - 1];
279 else
280 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
281 ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
282 ((RINTERNAL *)dest)->pgno = lchild->pgno;
283
284 /* Update the right page count. */
285 h->linp[skip] = h->upper -= nbytes;
286 dest = (char *)h + h->linp[skip];
287 ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
288 ((RINTERNAL *)dest)->pgno = rchild->pgno;
289 break;
290 case P_RLEAF:
291 /*
292 * Update the left page count. If split
293 * added at index 0, fix the correct page.
294 */
295 if (skip > 0)
296 dest = (char *)h + h->linp[skip - 1];
297 else
298 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
299 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
300 ((RINTERNAL *)dest)->pgno = lchild->pgno;
301
302 /* Update the right page count. */
303 h->linp[skip] = h->upper -= nbytes;
304 dest = (char *)h + h->linp[skip];
305 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
306 ((RINTERNAL *)dest)->pgno = rchild->pgno;
307 break;
308 default:
309 abort();
310 }
311
312 /* Unpin the held pages. */
313 if (!parentsplit) {
314 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
315 break;
316 }
317
318 /* If the root page was split, make it look right. */
319 if (sp->pgno == P_ROOT &&
9385eb3d 320 (F_ISSET(t, R_RECNO) ?
e9ce8d39
A
321 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
322 goto err1;
323
324 mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
325 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
326 }
327
328 /* Unpin the held pages. */
329 mpool_put(t->bt_mp, l, MPOOL_DIRTY);
330 mpool_put(t->bt_mp, r, MPOOL_DIRTY);
331
332 /* Clear any pages left on the stack. */
333 return (RET_SUCCESS);
334
335 /*
336 * If something fails in the above loop we were already walking back
337 * up the tree and the tree is now inconsistent. Nothing much we can
338 * do about it but release any memory we're holding.
339 */
340err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
341 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
342
343err2: mpool_put(t->bt_mp, l, 0);
344 mpool_put(t->bt_mp, r, 0);
345 __dbpanic(t->bt_dbp);
346 return (RET_ERROR);
347}
348
349/*
350 * BT_PAGE -- Split a non-root page of a btree.
351 *
352 * Parameters:
353 * t: tree
354 * h: root page
355 * lp: pointer to left page pointer
356 * rp: pointer to right page pointer
357 * skip: pointer to index to leave open
358 * ilen: insert length
359 *
360 * Returns:
361 * Pointer to page in which to insert or NULL on error.
362 */
363static PAGE *
364bt_page(t, h, lp, rp, skip, ilen)
365 BTREE *t;
366 PAGE *h, **lp, **rp;
367 indx_t *skip;
368 size_t ilen;
369{
370 PAGE *l, *r, *tp;
371 pgno_t npg;
372
373#ifdef STATISTICS
374 ++bt_split;
375#endif
376 /* Put the new right page for the split into place. */
377 if ((r = __bt_new(t, &npg)) == NULL)
378 return (NULL);
379 r->pgno = npg;
380 r->lower = BTDATAOFF;
381 r->upper = t->bt_psize;
382 r->nextpg = h->nextpg;
383 r->prevpg = h->pgno;
384 r->flags = h->flags & P_TYPE;
385
386 /*
387 * If we're splitting the last page on a level because we're appending
388 * a key to it (skip is NEXTINDEX()), it's likely that the data is
389 * sorted. Adding an empty page on the side of the level is less work
390 * and can push the fill factor much higher than normal. If we're
391 * wrong it's no big deal, we'll just do the split the right way next
392 * time. It may look like it's equally easy to do a similar hack for
393 * reverse sorted data, that is, split the tree left, but it's not.
394 * Don't even try.
395 */
396 if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
397#ifdef STATISTICS
398 ++bt_sortsplit;
399#endif
400 h->nextpg = r->pgno;
401 r->lower = BTDATAOFF + sizeof(indx_t);
402 *skip = 0;
403 *lp = h;
404 *rp = r;
405 return (r);
406 }
407
408 /* Put the new left page for the split into place. */
409 if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) {
410 mpool_put(t->bt_mp, r, 0);
411 return (NULL);
412 }
9385eb3d
A
413#ifdef PURIFY
414 memset(l, 0xff, t->bt_psize);
415#endif
e9ce8d39
A
416 l->pgno = h->pgno;
417 l->nextpg = r->pgno;
418 l->prevpg = h->prevpg;
419 l->lower = BTDATAOFF;
420 l->upper = t->bt_psize;
421 l->flags = h->flags & P_TYPE;
422
423 /* Fix up the previous pointer of the page after the split page. */
424 if (h->nextpg != P_INVALID) {
425 if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
426 free(l);
427 /* XXX mpool_free(t->bt_mp, r->pgno); */
428 return (NULL);
429 }
430 tp->prevpg = r->pgno;
9385eb3d 431 mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
e9ce8d39
A
432 }
433
434 /*
435 * Split right. The key/data pairs aren't sorted in the btree page so
436 * it's simpler to copy the data from the split page onto two new pages
437 * instead of copying half the data to the right page and compacting
438 * the left page in place. Since the left page can't change, we have
439 * to swap the original and the allocated left page after the split.
440 */
441 tp = bt_psplit(t, h, l, r, skip, ilen);
442
443 /* Move the new left page onto the old left page. */
444 memmove(h, l, t->bt_psize);
445 if (tp == l)
446 tp = h;
447 free(l);
448
449 *lp = h;
450 *rp = r;
451 return (tp);
452}
453
454/*
455 * BT_ROOT -- Split the root page of a btree.
456 *
457 * Parameters:
458 * t: tree
459 * h: root page
460 * lp: pointer to left page pointer
461 * rp: pointer to right page pointer
462 * skip: pointer to index to leave open
463 * ilen: insert length
464 *
465 * Returns:
466 * Pointer to page in which to insert or NULL on error.
467 */
468static PAGE *
469bt_root(t, h, lp, rp, skip, ilen)
470 BTREE *t;
471 PAGE *h, **lp, **rp;
472 indx_t *skip;
473 size_t ilen;
474{
475 PAGE *l, *r, *tp;
476 pgno_t lnpg, rnpg;
477
478#ifdef STATISTICS
479 ++bt_split;
480 ++bt_rootsplit;
481#endif
482 /* Put the new left and right pages for the split into place. */
483 if ((l = __bt_new(t, &lnpg)) == NULL ||
484 (r = __bt_new(t, &rnpg)) == NULL)
485 return (NULL);
486 l->pgno = lnpg;
487 r->pgno = rnpg;
488 l->nextpg = r->pgno;
489 r->prevpg = l->pgno;
490 l->prevpg = r->nextpg = P_INVALID;
491 l->lower = r->lower = BTDATAOFF;
492 l->upper = r->upper = t->bt_psize;
493 l->flags = r->flags = h->flags & P_TYPE;
494
495 /* Split the root page. */
496 tp = bt_psplit(t, h, l, r, skip, ilen);
497
498 *lp = l;
499 *rp = r;
500 return (tp);
501}
502
503/*
504 * BT_RROOT -- Fix up the recno root page after it has been split.
505 *
506 * Parameters:
507 * t: tree
508 * h: root page
509 * l: left page
510 * r: right page
511 *
512 * Returns:
513 * RET_ERROR, RET_SUCCESS
514 */
515static int
516bt_rroot(t, h, l, r)
517 BTREE *t;
518 PAGE *h, *l, *r;
519{
520 char *dest;
521
522 /* Insert the left and right keys, set the header information. */
523 h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
524 dest = (char *)h + h->upper;
525 WR_RINTERNAL(dest,
526 l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
527
528 h->linp[1] = h->upper -= NRINTERNAL;
529 dest = (char *)h + h->upper;
530 WR_RINTERNAL(dest,
531 r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
532
533 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
534
535 /* Unpin the root page, set to recno internal page. */
536 h->flags &= ~P_TYPE;
537 h->flags |= P_RINTERNAL;
538 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
539
540 return (RET_SUCCESS);
541}
542
543/*
544 * BT_BROOT -- Fix up the btree root page after it has been split.
545 *
546 * Parameters:
547 * t: tree
548 * h: root page
549 * l: left page
550 * r: right page
551 *
552 * Returns:
553 * RET_ERROR, RET_SUCCESS
554 */
555static int
556bt_broot(t, h, l, r)
557 BTREE *t;
558 PAGE *h, *l, *r;
559{
560 BINTERNAL *bi;
561 BLEAF *bl;
9385eb3d 562 u_int32_t nbytes;
e9ce8d39
A
563 char *dest;
564
565 /*
566 * If the root page was a leaf page, change it into an internal page.
567 * We copy the key we split on (but not the key's data, in the case of
568 * a leaf page) to the new root page.
569 *
570 * The btree comparison code guarantees that the left-most key on any
571 * level of the tree is never used, so it doesn't need to be filled in.
572 */
573 nbytes = NBINTERNAL(0);
574 h->linp[0] = h->upper = t->bt_psize - nbytes;
575 dest = (char *)h + h->upper;
576 WR_BINTERNAL(dest, 0, l->pgno, 0);
577
9385eb3d 578 switch (h->flags & P_TYPE) {
e9ce8d39
A
579 case P_BLEAF:
580 bl = GETBLEAF(r, 0);
581 nbytes = NBINTERNAL(bl->ksize);
582 h->linp[1] = h->upper -= nbytes;
583 dest = (char *)h + h->upper;
584 WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
585 memmove(dest, bl->bytes, bl->ksize);
586
587 /*
588 * If the key is on an overflow page, mark the overflow chain
589 * so it isn't deleted when the leaf copy of the key is deleted.
590 */
591 if (bl->flags & P_BIGKEY &&
592 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
593 return (RET_ERROR);
594 break;
595 case P_BINTERNAL:
596 bi = GETBINTERNAL(r, 0);
597 nbytes = NBINTERNAL(bi->ksize);
598 h->linp[1] = h->upper -= nbytes;
599 dest = (char *)h + h->upper;
600 memmove(dest, bi, nbytes);
601 ((BINTERNAL *)dest)->pgno = r->pgno;
602 break;
603 default:
604 abort();
605 }
606
607 /* There are two keys on the page. */
608 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
609
610 /* Unpin the root page, set to btree internal page. */
611 h->flags &= ~P_TYPE;
612 h->flags |= P_BINTERNAL;
613 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
614
615 return (RET_SUCCESS);
616}
617
618/*
619 * BT_PSPLIT -- Do the real work of splitting the page.
620 *
621 * Parameters:
622 * t: tree
623 * h: page to be split
624 * l: page to put lower half of data
625 * r: page to put upper half of data
626 * pskip: pointer to index to leave open
627 * ilen: insert length
628 *
629 * Returns:
630 * Pointer to page in which to insert.
631 */
632static PAGE *
633bt_psplit(t, h, l, r, pskip, ilen)
634 BTREE *t;
635 PAGE *h, *l, *r;
636 indx_t *pskip;
637 size_t ilen;
638{
639 BINTERNAL *bi;
640 BLEAF *bl;
9385eb3d 641 CURSOR *c;
e9ce8d39 642 RLEAF *rl;
e9ce8d39
A
643 PAGE *rval;
644 void *src;
645 indx_t full, half, nxt, off, skip, top, used;
9385eb3d 646 u_int32_t nbytes;
e9ce8d39
A
647 int bigkeycnt, isbigkey;
648
649 /*
650 * Split the data to the left and right pages. Leave the skip index
651 * open. Additionally, make some effort not to split on an overflow
652 * key. This makes internal page processing faster and can save
653 * space as overflow keys used by internal pages are never deleted.
654 */
655 bigkeycnt = 0;
656 skip = *pskip;
657 full = t->bt_psize - BTDATAOFF;
658 half = full / 2;
659 used = 0;
660 for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
661 if (skip == off) {
662 nbytes = ilen;
663 isbigkey = 0; /* XXX: not really known. */
664 } else
665 switch (h->flags & P_TYPE) {
666 case P_BINTERNAL:
667 src = bi = GETBINTERNAL(h, nxt);
668 nbytes = NBINTERNAL(bi->ksize);
669 isbigkey = bi->flags & P_BIGKEY;
670 break;
671 case P_BLEAF:
672 src = bl = GETBLEAF(h, nxt);
673 nbytes = NBLEAF(bl);
674 isbigkey = bl->flags & P_BIGKEY;
675 break;
676 case P_RINTERNAL:
677 src = GETRINTERNAL(h, nxt);
678 nbytes = NRINTERNAL;
679 isbigkey = 0;
680 break;
681 case P_RLEAF:
682 src = rl = GETRLEAF(h, nxt);
683 nbytes = NRLEAF(rl);
684 isbigkey = 0;
685 break;
686 default:
687 abort();
688 }
689
690 /*
691 * If the key/data pairs are substantial fractions of the max
692 * possible size for the page, it's possible to get situations
693 * where we decide to try and copy too much onto the left page.
694 * Make sure that doesn't happen.
695 */
9385eb3d
A
696 if ((skip <= off && used + nbytes + sizeof(indx_t) >= full)
697 || nxt == top - 1) {
e9ce8d39
A
698 --off;
699 break;
700 }
701
702 /* Copy the key/data pair, if not the skipped index. */
703 if (skip != off) {
704 ++nxt;
705
706 l->linp[off] = l->upper -= nbytes;
707 memmove((char *)l + l->upper, src, nbytes);
708 }
709
9385eb3d 710 used += nbytes + sizeof(indx_t);
e9ce8d39
A
711 if (used >= half) {
712 if (!isbigkey || bigkeycnt == 3)
713 break;
714 else
715 ++bigkeycnt;
716 }
717 }
718
719 /*
720 * Off is the last offset that's valid for the left page.
721 * Nxt is the first offset to be placed on the right page.
722 */
723 l->lower += (off + 1) * sizeof(indx_t);
724
725 /*
726 * If splitting the page that the cursor was on, the cursor has to be
727 * adjusted to point to the same record as before the split. If the
728 * cursor is at or past the skipped slot, the cursor is incremented by
729 * one. If the cursor is on the right page, it is decremented by the
730 * number of records split to the left page.
e9ce8d39 731 */
9385eb3d
A
732 c = &t->bt_cursor;
733 if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
734 if (c->pg.index >= skip)
735 ++c->pg.index;
736 if (c->pg.index < nxt) /* Left page. */
737 c->pg.pgno = l->pgno;
e9ce8d39 738 else { /* Right page. */
9385eb3d
A
739 c->pg.pgno = r->pgno;
740 c->pg.index -= nxt;
e9ce8d39
A
741 }
742 }
743
744 /*
745 * If the skipped index was on the left page, just return that page.
746 * Otherwise, adjust the skip index to reflect the new position on
747 * the right page.
748 */
749 if (skip <= off) {
750 skip = 0;
751 rval = l;
752 } else {
753 rval = r;
754 *pskip -= nxt;
755 }
756
757 for (off = 0; nxt < top; ++off) {
758 if (skip == nxt) {
759 ++off;
760 skip = 0;
761 }
762 switch (h->flags & P_TYPE) {
763 case P_BINTERNAL:
764 src = bi = GETBINTERNAL(h, nxt);
765 nbytes = NBINTERNAL(bi->ksize);
766 break;
767 case P_BLEAF:
768 src = bl = GETBLEAF(h, nxt);
769 nbytes = NBLEAF(bl);
770 break;
771 case P_RINTERNAL:
772 src = GETRINTERNAL(h, nxt);
773 nbytes = NRINTERNAL;
774 break;
775 case P_RLEAF:
776 src = rl = GETRLEAF(h, nxt);
777 nbytes = NRLEAF(rl);
778 break;
779 default:
780 abort();
781 }
782 ++nxt;
783 r->linp[off] = r->upper -= nbytes;
784 memmove((char *)r + r->upper, src, nbytes);
785 }
786 r->lower += off * sizeof(indx_t);
787
788 /* If the key is being appended to the page, adjust the index. */
789 if (skip == top)
790 r->lower += sizeof(indx_t);
791
792 return (rval);
793}
794
795/*
796 * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
797 *
798 * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
799 * record that references them gets deleted. Chains pointed to by internal
800 * pages never get deleted. This routine marks a chain as pointed to by an
801 * internal page.
802 *
803 * Parameters:
804 * t: tree
805 * pg: page number of first page in the chain.
806 *
807 * Returns:
808 * RET_SUCCESS, RET_ERROR.
809 */
810static int
811bt_preserve(t, pg)
812 BTREE *t;
813 pgno_t pg;
814{
815 PAGE *h;
816
817 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
818 return (RET_ERROR);
819 h->flags |= P_PRESERVE;
820 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
821 return (RET_SUCCESS);
822}
823
824/*
825 * REC_TOTAL -- Return the number of recno entries below a page.
826 *
827 * Parameters:
828 * h: page
829 *
830 * Returns:
831 * The number of recno entries below a page.
832 *
833 * XXX
834 * These values could be set by the bt_psplit routine. The problem is that the
835 * entry has to be popped off of the stack etc. or the values have to be passed
836 * all the way back to bt_split/bt_rroot and it's not very clean.
837 */
838static recno_t
839rec_total(h)
840 PAGE *h;
841{
842 recno_t recs;
843 indx_t nxt, top;
844
845 for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
846 recs += GETRINTERNAL(h, nxt)->nrecs;
847 return (recs);
848}