This source file includes following definitions.
- memcpy_disk
- array_insert
- bsearch
- lower_bound
- upper_bound
- inc_children
- insert_at
- calc_max_entries
- dm_btree_empty
- top_frame
- unprocessed_frames
- prefetch_children
- is_internal_level
- push_frame
- pop_frame
- unlock_all_frames
- dm_btree_del
- btree_lookup_raw
- dm_btree_lookup
- dm_btree_lookup_next_single
- dm_btree_lookup_next
- btree_split_sibling
- btree_split_beneath
- btree_insert_raw
- need_insert
- insert
- dm_btree_insert
- dm_btree_insert_notify
- find_key
- dm_btree_find_key
- dm_btree_find_highest_key
- dm_btree_find_lowest_key
- walk_node
- dm_btree_walk
- prefetch_values
- leaf_node
- push_node
- pop_node
- inc_or_backtrack
- find_leaf
- dm_btree_cursor_begin
- dm_btree_cursor_end
- dm_btree_cursor_next
- dm_btree_cursor_skip
- dm_btree_cursor_get_value
1
2
3
4
5
6
7 #include "dm-btree-internal.h"
8 #include "dm-space-map.h"
9 #include "dm-transaction-manager.h"
10
11 #include <linux/export.h>
12 #include <linux/device-mapper.h>
13
14 #define DM_MSG_PREFIX "btree"
15
16
17
18
19 static void memcpy_disk(void *dest, const void *src, size_t len)
20 __dm_written_to_disk(src)
21 {
22 memcpy(dest, src, len);
23 __dm_unbless_for_disk(src);
24 }
25
26 static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
27 unsigned index, void *elt)
28 __dm_written_to_disk(elt)
29 {
30 if (index < nr_elts)
31 memmove(base + (elt_size * (index + 1)),
32 base + (elt_size * index),
33 (nr_elts - index) * elt_size);
34
35 memcpy_disk(base + (elt_size * index), elt, elt_size);
36 }
37
38
39
40
41 static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
42 {
43 int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
44
45 while (hi - lo > 1) {
46 int mid = lo + ((hi - lo) / 2);
47 uint64_t mid_key = le64_to_cpu(n->keys[mid]);
48
49 if (mid_key == key)
50 return mid;
51
52 if (mid_key < key)
53 lo = mid;
54 else
55 hi = mid;
56 }
57
58 return want_hi ? hi : lo;
59 }
60
61 int lower_bound(struct btree_node *n, uint64_t key)
62 {
63 return bsearch(n, key, 0);
64 }
65
66 static int upper_bound(struct btree_node *n, uint64_t key)
67 {
68 return bsearch(n, key, 1);
69 }
70
71 void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
72 struct dm_btree_value_type *vt)
73 {
74 unsigned i;
75 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
76
77 if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
78 for (i = 0; i < nr_entries; i++)
79 dm_tm_inc(tm, value64(n, i));
80 else if (vt->inc)
81 for (i = 0; i < nr_entries; i++)
82 vt->inc(vt->context, value_ptr(n, i));
83 }
84
85 static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
86 uint64_t key, void *value)
87 __dm_written_to_disk(value)
88 {
89 uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
90 __le64 key_le = cpu_to_le64(key);
91
92 if (index > nr_entries ||
93 index >= le32_to_cpu(node->header.max_entries)) {
94 DMERR("too many entries in btree node for insert");
95 __dm_unbless_for_disk(value);
96 return -ENOMEM;
97 }
98
99 __dm_bless_for_disk(&key_le);
100
101 array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
102 array_insert(value_base(node), value_size, nr_entries, index, value);
103 node->header.nr_entries = cpu_to_le32(nr_entries + 1);
104
105 return 0;
106 }
107
108
109
110
111
112
113
114 static uint32_t calc_max_entries(size_t value_size, size_t block_size)
115 {
116 uint32_t total, n;
117 size_t elt_size = sizeof(uint64_t) + value_size;
118
119 block_size -= sizeof(struct node_header);
120 total = block_size / elt_size;
121 n = total / 3;
122
123 return 3 * n;
124 }
125
126 int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
127 {
128 int r;
129 struct dm_block *b;
130 struct btree_node *n;
131 size_t block_size;
132 uint32_t max_entries;
133
134 r = new_block(info, &b);
135 if (r < 0)
136 return r;
137
138 block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
139 max_entries = calc_max_entries(info->value_type.size, block_size);
140
141 n = dm_block_data(b);
142 memset(n, 0, block_size);
143 n->header.flags = cpu_to_le32(LEAF_NODE);
144 n->header.nr_entries = cpu_to_le32(0);
145 n->header.max_entries = cpu_to_le32(max_entries);
146 n->header.value_size = cpu_to_le32(info->value_type.size);
147
148 *root = dm_block_location(b);
149 unlock_block(info, b);
150
151 return 0;
152 }
153 EXPORT_SYMBOL_GPL(dm_btree_empty);
154
155
156
157
158
159
160
161 #define MAX_SPINE_DEPTH 64
162 struct frame {
163 struct dm_block *b;
164 struct btree_node *n;
165 unsigned level;
166 unsigned nr_children;
167 unsigned current_child;
168 };
169
170 struct del_stack {
171 struct dm_btree_info *info;
172 struct dm_transaction_manager *tm;
173 int top;
174 struct frame spine[MAX_SPINE_DEPTH];
175 };
176
177 static int top_frame(struct del_stack *s, struct frame **f)
178 {
179 if (s->top < 0) {
180 DMERR("btree deletion stack empty");
181 return -EINVAL;
182 }
183
184 *f = s->spine + s->top;
185
186 return 0;
187 }
188
189 static int unprocessed_frames(struct del_stack *s)
190 {
191 return s->top >= 0;
192 }
193
194 static void prefetch_children(struct del_stack *s, struct frame *f)
195 {
196 unsigned i;
197 struct dm_block_manager *bm = dm_tm_get_bm(s->tm);
198
199 for (i = 0; i < f->nr_children; i++)
200 dm_bm_prefetch(bm, value64(f->n, i));
201 }
202
203 static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
204 {
205 return f->level < (info->levels - 1);
206 }
207
208 static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
209 {
210 int r;
211 uint32_t ref_count;
212
213 if (s->top >= MAX_SPINE_DEPTH - 1) {
214 DMERR("btree deletion stack out of memory");
215 return -ENOMEM;
216 }
217
218 r = dm_tm_ref(s->tm, b, &ref_count);
219 if (r)
220 return r;
221
222 if (ref_count > 1)
223
224
225
226
227 dm_tm_dec(s->tm, b);
228
229 else {
230 uint32_t flags;
231 struct frame *f = s->spine + ++s->top;
232
233 r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
234 if (r) {
235 s->top--;
236 return r;
237 }
238
239 f->n = dm_block_data(f->b);
240 f->level = level;
241 f->nr_children = le32_to_cpu(f->n->header.nr_entries);
242 f->current_child = 0;
243
244 flags = le32_to_cpu(f->n->header.flags);
245 if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
246 prefetch_children(s, f);
247 }
248
249 return 0;
250 }
251
252 static void pop_frame(struct del_stack *s)
253 {
254 struct frame *f = s->spine + s->top--;
255
256 dm_tm_dec(s->tm, dm_block_location(f->b));
257 dm_tm_unlock(s->tm, f->b);
258 }
259
260 static void unlock_all_frames(struct del_stack *s)
261 {
262 struct frame *f;
263
264 while (unprocessed_frames(s)) {
265 f = s->spine + s->top--;
266 dm_tm_unlock(s->tm, f->b);
267 }
268 }
269
270 int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
271 {
272 int r;
273 struct del_stack *s;
274
275
276
277
278
279
280 s = kmalloc(sizeof(*s), GFP_NOFS);
281 if (!s)
282 return -ENOMEM;
283 s->info = info;
284 s->tm = info->tm;
285 s->top = -1;
286
287 r = push_frame(s, root, 0);
288 if (r)
289 goto out;
290
291 while (unprocessed_frames(s)) {
292 uint32_t flags;
293 struct frame *f;
294 dm_block_t b;
295
296 r = top_frame(s, &f);
297 if (r)
298 goto out;
299
300 if (f->current_child >= f->nr_children) {
301 pop_frame(s);
302 continue;
303 }
304
305 flags = le32_to_cpu(f->n->header.flags);
306 if (flags & INTERNAL_NODE) {
307 b = value64(f->n, f->current_child);
308 f->current_child++;
309 r = push_frame(s, b, f->level);
310 if (r)
311 goto out;
312
313 } else if (is_internal_level(info, f)) {
314 b = value64(f->n, f->current_child);
315 f->current_child++;
316 r = push_frame(s, b, f->level + 1);
317 if (r)
318 goto out;
319
320 } else {
321 if (info->value_type.dec) {
322 unsigned i;
323
324 for (i = 0; i < f->nr_children; i++)
325 info->value_type.dec(info->value_type.context,
326 value_ptr(f->n, i));
327 }
328 pop_frame(s);
329 }
330 }
331 out:
332 if (r) {
333
334 unlock_all_frames(s);
335 }
336 kfree(s);
337
338 return r;
339 }
340 EXPORT_SYMBOL_GPL(dm_btree_del);
341
342
343
344 static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
345 int (*search_fn)(struct btree_node *, uint64_t),
346 uint64_t *result_key, void *v, size_t value_size)
347 {
348 int i, r;
349 uint32_t flags, nr_entries;
350
351 do {
352 r = ro_step(s, block);
353 if (r < 0)
354 return r;
355
356 i = search_fn(ro_node(s), key);
357
358 flags = le32_to_cpu(ro_node(s)->header.flags);
359 nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
360 if (i < 0 || i >= nr_entries)
361 return -ENODATA;
362
363 if (flags & INTERNAL_NODE)
364 block = value64(ro_node(s), i);
365
366 } while (!(flags & LEAF_NODE));
367
368 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
369 memcpy(v, value_ptr(ro_node(s), i), value_size);
370
371 return 0;
372 }
373
374 int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
375 uint64_t *keys, void *value_le)
376 {
377 unsigned level, last_level = info->levels - 1;
378 int r = -ENODATA;
379 uint64_t rkey;
380 __le64 internal_value_le;
381 struct ro_spine spine;
382
383 init_ro_spine(&spine, info);
384 for (level = 0; level < info->levels; level++) {
385 size_t size;
386 void *value_p;
387
388 if (level == last_level) {
389 value_p = value_le;
390 size = info->value_type.size;
391
392 } else {
393 value_p = &internal_value_le;
394 size = sizeof(uint64_t);
395 }
396
397 r = btree_lookup_raw(&spine, root, keys[level],
398 lower_bound, &rkey,
399 value_p, size);
400
401 if (!r) {
402 if (rkey != keys[level]) {
403 exit_ro_spine(&spine);
404 return -ENODATA;
405 }
406 } else {
407 exit_ro_spine(&spine);
408 return r;
409 }
410
411 root = le64_to_cpu(internal_value_le);
412 }
413 exit_ro_spine(&spine);
414
415 return r;
416 }
417 EXPORT_SYMBOL_GPL(dm_btree_lookup);
418
419 static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
420 uint64_t key, uint64_t *rkey, void *value_le)
421 {
422 int r, i;
423 uint32_t flags, nr_entries;
424 struct dm_block *node;
425 struct btree_node *n;
426
427 r = bn_read_lock(info, root, &node);
428 if (r)
429 return r;
430
431 n = dm_block_data(node);
432 flags = le32_to_cpu(n->header.flags);
433 nr_entries = le32_to_cpu(n->header.nr_entries);
434
435 if (flags & INTERNAL_NODE) {
436 i = lower_bound(n, key);
437 if (i < 0) {
438
439
440
441
442 i = 0;
443 }
444 if (i >= nr_entries) {
445 r = -ENODATA;
446 goto out;
447 }
448
449 r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
450 if (r == -ENODATA && i < (nr_entries - 1)) {
451 i++;
452 r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
453 }
454
455 } else {
456 i = upper_bound(n, key);
457 if (i < 0 || i >= nr_entries) {
458 r = -ENODATA;
459 goto out;
460 }
461
462 *rkey = le64_to_cpu(n->keys[i]);
463 memcpy(value_le, value_ptr(n, i), info->value_type.size);
464 }
465 out:
466 dm_tm_unlock(info->tm, node);
467 return r;
468 }
469
470 int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
471 uint64_t *keys, uint64_t *rkey, void *value_le)
472 {
473 unsigned level;
474 int r = -ENODATA;
475 __le64 internal_value_le;
476 struct ro_spine spine;
477
478 init_ro_spine(&spine, info);
479 for (level = 0; level < info->levels - 1u; level++) {
480 r = btree_lookup_raw(&spine, root, keys[level],
481 lower_bound, rkey,
482 &internal_value_le, sizeof(uint64_t));
483 if (r)
484 goto out;
485
486 if (*rkey != keys[level]) {
487 r = -ENODATA;
488 goto out;
489 }
490
491 root = le64_to_cpu(internal_value_le);
492 }
493
494 r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
495 out:
496 exit_ro_spine(&spine);
497 return r;
498 }
499
500 EXPORT_SYMBOL_GPL(dm_btree_lookup_next);
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532 static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
533 uint64_t key)
534 {
535 int r;
536 size_t size;
537 unsigned nr_left, nr_right;
538 struct dm_block *left, *right, *parent;
539 struct btree_node *ln, *rn, *pn;
540 __le64 location;
541
542 left = shadow_current(s);
543
544 r = new_block(s->info, &right);
545 if (r < 0)
546 return r;
547
548 ln = dm_block_data(left);
549 rn = dm_block_data(right);
550
551 nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
552 nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;
553
554 ln->header.nr_entries = cpu_to_le32(nr_left);
555
556 rn->header.flags = ln->header.flags;
557 rn->header.nr_entries = cpu_to_le32(nr_right);
558 rn->header.max_entries = ln->header.max_entries;
559 rn->header.value_size = ln->header.value_size;
560 memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));
561
562 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
563 sizeof(uint64_t) : s->info->value_type.size;
564 memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
565 size * nr_right);
566
567
568
569
570 parent = shadow_parent(s);
571
572 pn = dm_block_data(parent);
573 location = cpu_to_le64(dm_block_location(left));
574 __dm_bless_for_disk(&location);
575 memcpy_disk(value_ptr(pn, parent_index),
576 &location, sizeof(__le64));
577
578 location = cpu_to_le64(dm_block_location(right));
579 __dm_bless_for_disk(&location);
580
581 r = insert_at(sizeof(__le64), pn, parent_index + 1,
582 le64_to_cpu(rn->keys[0]), &location);
583 if (r) {
584 unlock_block(s->info, right);
585 return r;
586 }
587
588 if (key < le64_to_cpu(rn->keys[0])) {
589 unlock_block(s->info, right);
590 s->nodes[1] = left;
591 } else {
592 unlock_block(s->info, left);
593 s->nodes[1] = right;
594 }
595
596 return 0;
597 }
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620 static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
621 {
622 int r;
623 size_t size;
624 unsigned nr_left, nr_right;
625 struct dm_block *left, *right, *new_parent;
626 struct btree_node *pn, *ln, *rn;
627 __le64 val;
628
629 new_parent = shadow_current(s);
630
631 pn = dm_block_data(new_parent);
632 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
633 sizeof(__le64) : s->info->value_type.size;
634
635
636 r = new_block(s->info, &left);
637 if (r < 0)
638 return r;
639
640 ln = dm_block_data(left);
641 nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
642
643 ln->header.flags = pn->header.flags;
644 ln->header.nr_entries = cpu_to_le32(nr_left);
645 ln->header.max_entries = pn->header.max_entries;
646 ln->header.value_size = pn->header.value_size;
647 memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
648 memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
649
650
651 r = new_block(s->info, &right);
652 if (r < 0) {
653 unlock_block(s->info, left);
654 return r;
655 }
656
657 rn = dm_block_data(right);
658 nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
659
660 rn->header.flags = pn->header.flags;
661 rn->header.nr_entries = cpu_to_le32(nr_right);
662 rn->header.max_entries = pn->header.max_entries;
663 rn->header.value_size = pn->header.value_size;
664 memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
665 memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
666 nr_right * size);
667
668
669 pn->header.flags = cpu_to_le32(INTERNAL_NODE);
670 pn->header.nr_entries = cpu_to_le32(2);
671 pn->header.max_entries = cpu_to_le32(
672 calc_max_entries(sizeof(__le64),
673 dm_bm_block_size(
674 dm_tm_get_bm(s->info->tm))));
675 pn->header.value_size = cpu_to_le32(sizeof(__le64));
676
677 val = cpu_to_le64(dm_block_location(left));
678 __dm_bless_for_disk(&val);
679 pn->keys[0] = ln->keys[0];
680 memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
681
682 val = cpu_to_le64(dm_block_location(right));
683 __dm_bless_for_disk(&val);
684 pn->keys[1] = rn->keys[0];
685 memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
686
687 unlock_block(s->info, left);
688 unlock_block(s->info, right);
689 return 0;
690 }
691
692 static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
693 struct dm_btree_value_type *vt,
694 uint64_t key, unsigned *index)
695 {
696 int r, i = *index, top = 1;
697 struct btree_node *node;
698
699 for (;;) {
700 r = shadow_step(s, root, vt);
701 if (r < 0)
702 return r;
703
704 node = dm_block_data(shadow_current(s));
705
706
707
708
709
710
711 if (shadow_has_parent(s) && i >= 0) {
712 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
713
714 __dm_bless_for_disk(&location);
715 memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
716 &location, sizeof(__le64));
717 }
718
719 node = dm_block_data(shadow_current(s));
720
721 if (node->header.nr_entries == node->header.max_entries) {
722 if (top)
723 r = btree_split_beneath(s, key);
724 else
725 r = btree_split_sibling(s, i, key);
726
727 if (r < 0)
728 return r;
729 }
730
731 node = dm_block_data(shadow_current(s));
732
733 i = lower_bound(node, key);
734
735 if (le32_to_cpu(node->header.flags) & LEAF_NODE)
736 break;
737
738 if (i < 0) {
739
740 node->keys[0] = cpu_to_le64(key);
741 i = 0;
742 }
743
744 root = value64(node, i);
745 top = 0;
746 }
747
748 if (i < 0 || le64_to_cpu(node->keys[i]) != key)
749 i++;
750
751 *index = i;
752 return 0;
753 }
754
755 static bool need_insert(struct btree_node *node, uint64_t *keys,
756 unsigned level, unsigned index)
757 {
758 return ((index >= le32_to_cpu(node->header.nr_entries)) ||
759 (le64_to_cpu(node->keys[index]) != keys[level]));
760 }
761
762 static int insert(struct dm_btree_info *info, dm_block_t root,
763 uint64_t *keys, void *value, dm_block_t *new_root,
764 int *inserted)
765 __dm_written_to_disk(value)
766 {
767 int r;
768 unsigned level, index = -1, last_level = info->levels - 1;
769 dm_block_t block = root;
770 struct shadow_spine spine;
771 struct btree_node *n;
772 struct dm_btree_value_type le64_type;
773
774 init_le64_type(info->tm, &le64_type);
775 init_shadow_spine(&spine, info);
776
777 for (level = 0; level < (info->levels - 1); level++) {
778 r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
779 if (r < 0)
780 goto bad;
781
782 n = dm_block_data(shadow_current(&spine));
783
784 if (need_insert(n, keys, level, index)) {
785 dm_block_t new_tree;
786 __le64 new_le;
787
788 r = dm_btree_empty(info, &new_tree);
789 if (r < 0)
790 goto bad;
791
792 new_le = cpu_to_le64(new_tree);
793 __dm_bless_for_disk(&new_le);
794
795 r = insert_at(sizeof(uint64_t), n, index,
796 keys[level], &new_le);
797 if (r)
798 goto bad;
799 }
800
801 if (level < last_level)
802 block = value64(n, index);
803 }
804
805 r = btree_insert_raw(&spine, block, &info->value_type,
806 keys[level], &index);
807 if (r < 0)
808 goto bad;
809
810 n = dm_block_data(shadow_current(&spine));
811
812 if (need_insert(n, keys, level, index)) {
813 if (inserted)
814 *inserted = 1;
815
816 r = insert_at(info->value_type.size, n, index,
817 keys[level], value);
818 if (r)
819 goto bad_unblessed;
820 } else {
821 if (inserted)
822 *inserted = 0;
823
824 if (info->value_type.dec &&
825 (!info->value_type.equal ||
826 !info->value_type.equal(
827 info->value_type.context,
828 value_ptr(n, index),
829 value))) {
830 info->value_type.dec(info->value_type.context,
831 value_ptr(n, index));
832 }
833 memcpy_disk(value_ptr(n, index),
834 value, info->value_type.size);
835 }
836
837 *new_root = shadow_root(&spine);
838 exit_shadow_spine(&spine);
839
840 return 0;
841
842 bad:
843 __dm_unbless_for_disk(value);
844 bad_unblessed:
845 exit_shadow_spine(&spine);
846 return r;
847 }
848
849 int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
850 uint64_t *keys, void *value, dm_block_t *new_root)
851 __dm_written_to_disk(value)
852 {
853 return insert(info, root, keys, value, new_root, NULL);
854 }
855 EXPORT_SYMBOL_GPL(dm_btree_insert);
856
857 int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
858 uint64_t *keys, void *value, dm_block_t *new_root,
859 int *inserted)
860 __dm_written_to_disk(value)
861 {
862 return insert(info, root, keys, value, new_root, inserted);
863 }
864 EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
865
866
867
868 static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
869 uint64_t *result_key, dm_block_t *next_block)
870 {
871 int i, r;
872 uint32_t flags;
873
874 do {
875 r = ro_step(s, block);
876 if (r < 0)
877 return r;
878
879 flags = le32_to_cpu(ro_node(s)->header.flags);
880 i = le32_to_cpu(ro_node(s)->header.nr_entries);
881 if (!i)
882 return -ENODATA;
883 else
884 i--;
885
886 if (find_highest)
887 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
888 else
889 *result_key = le64_to_cpu(ro_node(s)->keys[0]);
890
891 if (next_block || flags & INTERNAL_NODE) {
892 if (find_highest)
893 block = value64(ro_node(s), i);
894 else
895 block = value64(ro_node(s), 0);
896 }
897
898 } while (flags & INTERNAL_NODE);
899
900 if (next_block)
901 *next_block = block;
902 return 0;
903 }
904
905 static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
906 bool find_highest, uint64_t *result_keys)
907 {
908 int r = 0, count = 0, level;
909 struct ro_spine spine;
910
911 init_ro_spine(&spine, info);
912 for (level = 0; level < info->levels; level++) {
913 r = find_key(&spine, root, find_highest, result_keys + level,
914 level == info->levels - 1 ? NULL : &root);
915 if (r == -ENODATA) {
916 r = 0;
917 break;
918
919 } else if (r)
920 break;
921
922 count++;
923 }
924 exit_ro_spine(&spine);
925
926 return r ? r : count;
927 }
928
929 int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
930 uint64_t *result_keys)
931 {
932 return dm_btree_find_key(info, root, true, result_keys);
933 }
934 EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
935
936 int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
937 uint64_t *result_keys)
938 {
939 return dm_btree_find_key(info, root, false, result_keys);
940 }
941 EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
942
943
944
945
946
947
948
949 static int walk_node(struct dm_btree_info *info, dm_block_t block,
950 int (*fn)(void *context, uint64_t *keys, void *leaf),
951 void *context)
952 {
953 int r;
954 unsigned i, nr;
955 struct dm_block *node;
956 struct btree_node *n;
957 uint64_t keys;
958
959 r = bn_read_lock(info, block, &node);
960 if (r)
961 return r;
962
963 n = dm_block_data(node);
964
965 nr = le32_to_cpu(n->header.nr_entries);
966 for (i = 0; i < nr; i++) {
967 if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
968 r = walk_node(info, value64(n, i), fn, context);
969 if (r)
970 goto out;
971 } else {
972 keys = le64_to_cpu(*key_ptr(n, i));
973 r = fn(context, &keys, value_ptr(n, i));
974 if (r)
975 goto out;
976 }
977 }
978
979 out:
980 dm_tm_unlock(info->tm, node);
981 return r;
982 }
983
984 int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
985 int (*fn)(void *context, uint64_t *keys, void *leaf),
986 void *context)
987 {
988 BUG_ON(info->levels > 1);
989 return walk_node(info, root, fn, context);
990 }
991 EXPORT_SYMBOL_GPL(dm_btree_walk);
992
993
994
995 static void prefetch_values(struct dm_btree_cursor *c)
996 {
997 unsigned i, nr;
998 __le64 value_le;
999 struct cursor_node *n = c->nodes + c->depth - 1;
1000 struct btree_node *bn = dm_block_data(n->b);
1001 struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);
1002
1003 BUG_ON(c->info->value_type.size != sizeof(value_le));
1004
1005 nr = le32_to_cpu(bn->header.nr_entries);
1006 for (i = 0; i < nr; i++) {
1007 memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
1008 dm_bm_prefetch(bm, le64_to_cpu(value_le));
1009 }
1010 }
1011
1012 static bool leaf_node(struct dm_btree_cursor *c)
1013 {
1014 struct cursor_node *n = c->nodes + c->depth - 1;
1015 struct btree_node *bn = dm_block_data(n->b);
1016
1017 return le32_to_cpu(bn->header.flags) & LEAF_NODE;
1018 }
1019
1020 static int push_node(struct dm_btree_cursor *c, dm_block_t b)
1021 {
1022 int r;
1023 struct cursor_node *n = c->nodes + c->depth;
1024
1025 if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
1026 DMERR("couldn't push cursor node, stack depth too high");
1027 return -EINVAL;
1028 }
1029
1030 r = bn_read_lock(c->info, b, &n->b);
1031 if (r)
1032 return r;
1033
1034 n->index = 0;
1035 c->depth++;
1036
1037 if (c->prefetch_leaves || !leaf_node(c))
1038 prefetch_values(c);
1039
1040 return 0;
1041 }
1042
1043 static void pop_node(struct dm_btree_cursor *c)
1044 {
1045 c->depth--;
1046 unlock_block(c->info, c->nodes[c->depth].b);
1047 }
1048
1049 static int inc_or_backtrack(struct dm_btree_cursor *c)
1050 {
1051 struct cursor_node *n;
1052 struct btree_node *bn;
1053
1054 for (;;) {
1055 if (!c->depth)
1056 return -ENODATA;
1057
1058 n = c->nodes + c->depth - 1;
1059 bn = dm_block_data(n->b);
1060
1061 n->index++;
1062 if (n->index < le32_to_cpu(bn->header.nr_entries))
1063 break;
1064
1065 pop_node(c);
1066 }
1067
1068 return 0;
1069 }
1070
1071 static int find_leaf(struct dm_btree_cursor *c)
1072 {
1073 int r = 0;
1074 struct cursor_node *n;
1075 struct btree_node *bn;
1076 __le64 value_le;
1077
1078 for (;;) {
1079 n = c->nodes + c->depth - 1;
1080 bn = dm_block_data(n->b);
1081
1082 if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
1083 break;
1084
1085 memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
1086 r = push_node(c, le64_to_cpu(value_le));
1087 if (r) {
1088 DMERR("push_node failed");
1089 break;
1090 }
1091 }
1092
1093 if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
1094 return -ENODATA;
1095
1096 return r;
1097 }
1098
1099 int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
1100 bool prefetch_leaves, struct dm_btree_cursor *c)
1101 {
1102 int r;
1103
1104 c->info = info;
1105 c->root = root;
1106 c->depth = 0;
1107 c->prefetch_leaves = prefetch_leaves;
1108
1109 r = push_node(c, root);
1110 if (r)
1111 return r;
1112
1113 return find_leaf(c);
1114 }
1115 EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);
1116
1117 void dm_btree_cursor_end(struct dm_btree_cursor *c)
1118 {
1119 while (c->depth)
1120 pop_node(c);
1121 }
1122 EXPORT_SYMBOL_GPL(dm_btree_cursor_end);
1123
1124 int dm_btree_cursor_next(struct dm_btree_cursor *c)
1125 {
1126 int r = inc_or_backtrack(c);
1127 if (!r) {
1128 r = find_leaf(c);
1129 if (r)
1130 DMERR("find_leaf failed");
1131 }
1132
1133 return r;
1134 }
1135 EXPORT_SYMBOL_GPL(dm_btree_cursor_next);
1136
1137 int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
1138 {
1139 int r = 0;
1140
1141 while (count-- && !r)
1142 r = dm_btree_cursor_next(c);
1143
1144 return r;
1145 }
1146 EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);
1147
1148 int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
1149 {
1150 if (c->depth) {
1151 struct cursor_node *n = c->nodes + c->depth - 1;
1152 struct btree_node *bn = dm_block_data(n->b);
1153
1154 if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
1155 return -EINVAL;
1156
1157 *key = le64_to_cpu(*key_ptr(bn, n->index));
1158 memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
1159 return 0;
1160
1161 } else
1162 return -ENODATA;
1163 }
1164 EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);