/*
   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include <linux/bitops.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/drbd.h>
#include <asm/kmap_types.h>
#include "drbd_int.h"
/* OPAQUE outside this file!
 * interface defined in drbd_int.h
 *
 * convention:
 * function name drbd_bm_... => used elsewhere, "public".
 * function name      bm_... => internal to implementation, "private".
 *
 * Note that since find_first_bit returns int, at the current granularity of
 * the bitmap (4KB per bit), this implementation "only" supports up to
 * 1<<(32+12) == 16 TB...
 */
/*
 * NOTE
 *  Access to the *bm_pages is protected by bm_lock.
 *  It is safe to read the other members within the lock.
 *
 *  drbd_bm_set_bits is called from bio_endio callbacks;
 *  we may be called with irq already disabled,
 *  so we need spin_lock_irqsave(),
 *  and we need the kmap_atomic.
 */
struct drbd_bitmap {
	struct page **bm_pages;
	spinlock_t bm_lock;
	/* WARNING unsigned long bm_*:
	 * 32bit number of bit offset is just enough for 512 MB bitmap.
	 * it will blow up if we make the bitmap bigger...
	 * not that it makes much sense to have a bitmap that large,
	 * rather change the granularity to 16k or 64k or something.
	 * (that implies other problems, however...)
	 */
	unsigned long bm_set;       /* nr of set bits; THINK maybe atomic_t? */
	unsigned long bm_bits;
	size_t   bm_words;
	size_t   bm_number_of_pages;
	sector_t bm_dev_capacity;
	struct semaphore bm_change; /* serializes resize operations */

	atomic_t bm_async_io;
	wait_queue_head_t bm_io_wait;

	unsigned long  bm_flags;

	/* debugging aid, in case we are still racy somewhere */
	char          *bm_why;
	struct task_struct *bm_task;
};
/* definition of bits in bm_flags */
#define BM_LOCKED       0
#define BM_MD_IO_ERROR  1
#define BM_P_VMALLOCED  2
static int bm_is_locked(struct drbd_bitmap *b)
{
	return test_bit(BM_LOCKED, &b->bm_flags);
}
#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
{
	struct drbd_bitmap *b = mdev->bitmap;
	if (!__ratelimit(&drbd_ratelimit_state))
		return;
	dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
	    current == mdev->receiver.task ? "receiver" :
	    current == mdev->asender.task  ? "asender"  :
	    current == mdev->worker.task   ? "worker"   : current->comm,
	    func, b->bm_why ?: "?",
	    b->bm_task == mdev->receiver.task ? "receiver" :
	    b->bm_task == mdev->asender.task  ? "asender"  :
	    b->bm_task == mdev->worker.task   ? "worker"   : "?");
}
void drbd_bm_lock(struct drbd_conf *mdev, char *why)
{
	struct drbd_bitmap *b = mdev->bitmap;
	int trylock_failed;

	if (!b) {
		dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");
		return;
	}

	trylock_failed = down_trylock(&b->bm_change);

	if (trylock_failed) {
		dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
		    current == mdev->receiver.task ? "receiver" :
		    current == mdev->asender.task  ? "asender"  :
		    current == mdev->worker.task   ? "worker"   : current->comm,
		    why, b->bm_why ?: "?",
		    b->bm_task == mdev->receiver.task ? "receiver" :
		    b->bm_task == mdev->asender.task  ? "asender"  :
		    b->bm_task == mdev->worker.task   ? "worker"   : "?");
		down(&b->bm_change);
	}
	if (__test_and_set_bit(BM_LOCKED, &b->bm_flags))
		dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");

	b->bm_why  = why;
	b->bm_task = current;
}
void drbd_bm_unlock(struct drbd_conf *mdev)
{
	struct drbd_bitmap *b = mdev->bitmap;
	if (!b) {
		dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");
		return;
	}

	if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags))
		dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");

	b->bm_why  = NULL;
	b->bm_task = NULL;
	up(&b->bm_change);
}
/* word offset to long pointer */
static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km)
{
	struct page *page;
	unsigned long page_nr;

	/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
	page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
	BUG_ON(page_nr >= b->bm_number_of_pages);
	page = b->bm_pages[page_nr];

	return (unsigned long *) kmap_atomic(page, km);
}
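/* Example for the page_nr shift above, assuming PAGE_SHIFT == 12 and 64-bit
 * longs (LN2_BPL == 6): PAGE_SHIFT - LN2_BPL + 3 == 9, i.e. one page holds
 * 1<<9 == 512 long words, so word offset 513 lives in page 1. */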
static unsigned long *bm_map_paddr(struct drbd_bitmap *b, unsigned long offset)
{
	return __bm_map_paddr(b, offset, KM_IRQ1);
}
static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
{
	kunmap_atomic(p_addr, km);
}
static void bm_unmap(unsigned long *p_addr)
{
	__bm_unmap(p_addr, KM_IRQ1);
}
/* long word offset of _bitmap_ sector */
#define S2W(s)	((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))

/* word offset from start of bitmap to word number _in_page_,
 * modulo longs per page:
 *	#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)))
 * hm, well, Philipp thinks gcc might not optimize the % into & (... - 1),
 * so do it explicitly: */
#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))

/* Long words per page */
#define LWPP (PAGE_SIZE/sizeof(long))
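/* Worked example, assuming 4KB pages and 64-bit longs: LWPP == 4096/8 == 512,
 * and MLPP(1027) == 1027 & 511 == 3, i.e. bitmap word 1027 is word 3 within
 * its page.  With the usual BM_EXT_SHIFT == 24, BM_BLOCK_SHIFT == 12 and
 * LN2_BPL == 6, S2W(1) == 1<<6 == 64 long words, one 512 byte sector worth
 * of bitmap. */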
/*
 * actually most functions herein should take a struct drbd_bitmap*, not a
 * struct drbd_conf*, but for the debug macros I need the mdev around,
 * to be able to report device specific issues.
 */
static void bm_free_pages(struct page **pages, unsigned long number)
{
	unsigned long i;
	if (!pages)
		return;

	for (i = 0; i < number; i++) {
		if (!pages[i]) {
			printk(KERN_ALERT "drbd: bm_free_pages tried to free "
					  "a NULL pointer; i=%lu n=%lu\n",
					  i, number);
			continue;
		}
		__free_page(pages[i]);
		pages[i] = NULL;
	}
}
static void bm_vk_free(void *ptr, int v)
{
	if (v)
		vfree(ptr);
	else
		kfree(ptr);
}
/*
 * "have" and "want" are NUMBER OF PAGES.
 */
static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
{
	struct page **old_pages = b->bm_pages;
	struct page **new_pages, *page;
	unsigned int i, bytes, vmalloced = 0;
	unsigned long have = b->bm_number_of_pages;

	BUG_ON(have == 0 && old_pages != NULL);
	BUG_ON(have != 0 && old_pages == NULL);

	if (have == want)
		return old_pages;

	/* Trying kmalloc first, falling back to vmalloc.
	 * GFP_KERNEL is ok, as this is done when a lower level disk is
	 * "attached" to the drbd.  Context is receiver thread or cqueue
	 * thread.  As we have no disk yet, we are not in the IO path,
	 * not even the IO path of the peer. */
	bytes = sizeof(struct page *)*want;
	new_pages = kmalloc(bytes, GFP_KERNEL);
	if (!new_pages) {
		new_pages = vmalloc(bytes);
		if (!new_pages)
			return NULL;
		vmalloced = 1;
	}

	memset(new_pages, 0, bytes);
	if (want >= have) {
		for (i = 0; i < have; i++)
			new_pages[i] = old_pages[i];
		for (; i < want; i++) {
			page = alloc_page(GFP_HIGHUSER);
			if (!page) {
				bm_free_pages(new_pages + have, i - have);
				bm_vk_free(new_pages, vmalloced);
				return NULL;
			}
			new_pages[i] = page;
		}
	} else {
		for (i = 0; i < want; i++)
			new_pages[i] = old_pages[i];
		/* NOT HERE, we are outside the spinlock!
		bm_free_pages(old_pages + want, have - want);
		*/
	}

	if (vmalloced)
		set_bit(BM_P_VMALLOCED, &b->bm_flags);
	else
		clear_bit(BM_P_VMALLOCED, &b->bm_flags);

	return new_pages;
}
/*
 * called on driver init only. TODO call when a device is created.
 * allocates the drbd_bitmap, and stores it in mdev->bitmap.
 */
int drbd_bm_init(struct drbd_conf *mdev)
{
	struct drbd_bitmap *b = mdev->bitmap;
	WARN_ON(b != NULL);
	b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
	if (!b)
		return -ENOMEM;
	spin_lock_init(&b->bm_lock);
	init_MUTEX(&b->bm_change);
	init_waitqueue_head(&b->bm_io_wait);

	mdev->bitmap = b;

	return 0;
}
sector_t drbd_bm_capacity(struct drbd_conf *mdev)
{
	ERR_IF(!mdev->bitmap) return 0;
	return mdev->bitmap->bm_dev_capacity;
}
/* called on driver unload. TODO: call when a device is destroyed.
 */
void drbd_bm_cleanup(struct drbd_conf *mdev)
{
	ERR_IF (!mdev->bitmap) return;
	bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
	bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags));
	kfree(mdev->bitmap);
	mdev->bitmap = NULL;
}
/*
 * since (b->bm_bits % BITS_PER_LONG) != 0,
 * this masks out the remaining bits.
 * Returns the number of bits cleared.
 */
static int bm_clear_surplus(struct drbd_bitmap *b)
{
	const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
	size_t w = b->bm_bits >> LN2_BPL;
	int cleared = 0;
	unsigned long *p_addr, *bm;

	p_addr = bm_map_paddr(b, w);
	bm = p_addr + MLPP(w);
	if (w < b->bm_words) {
		cleared = hweight_long(*bm & ~mask);
		*bm &= mask;
		w++; bm++;
	}

	if (w < b->bm_words) {
		cleared += hweight_long(*bm);
		*bm = 0;
	}
	bm_unmap(p_addr);
	return cleared;
}
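/* Example for the mask in bm_clear_surplus()/bm_set_surplus(), assuming
 * BITS_PER_LONG == 64: for bm_bits == 70, bm_bits & 63 == 6, so
 * mask == (1UL<<6)-1 == 0x3f; the low 6 bits of the last used word are real
 * bitmap bits, everything above them is surplus. */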
static void bm_set_surplus(struct drbd_bitmap *b)
{
	const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
	size_t w = b->bm_bits >> LN2_BPL;
	unsigned long *p_addr, *bm;

	p_addr = bm_map_paddr(b, w);
	bm = p_addr + MLPP(w);
	if (w < b->bm_words) {
		*bm |= ~mask;
		bm++; w++;
	}

	if (w < b->bm_words) {
		*bm = ~(0UL);
	}
	bm_unmap(p_addr);
}
static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian)
{
	unsigned long *p_addr, *bm, offset = 0;
	unsigned long bits = 0;
	unsigned long i, do_now;

	while (offset < b->bm_words) {
		i = do_now = min_t(size_t, b->bm_words-offset, LWPP);
		p_addr = __bm_map_paddr(b, offset, KM_USER0);
		bm = p_addr + MLPP(offset);
		while (i--) {
#ifndef __LITTLE_ENDIAN
			if (swap_endian)
				*bm = lel_to_cpu(*bm);
#endif
			bits += hweight_long(*bm++);
		}
		__bm_unmap(p_addr, KM_USER0);
		offset += do_now;
	}

	return bits;
}

static unsigned long bm_count_bits(struct drbd_bitmap *b)
{
	return __bm_count_bits(b, 0);
}

static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
{
	return __bm_count_bits(b, 1);
}
/* offset and len in long words.*/
static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
{
	unsigned long *p_addr, *bm;
	size_t do_now, end;

#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
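/* e.g. with the usual BM_BLOCK_SIZE of 4096,
 * one bitmap bit covers 8 sectors of 512 bytes each */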
	end = offset + len;

	if (end > b->bm_words) {
		printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
		return;
	}

	while (offset < end) {
		do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
		p_addr = bm_map_paddr(b, offset);
		bm = p_addr + MLPP(offset);
		if (bm+do_now > p_addr + LWPP) {
			printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
			       p_addr, bm, (int)do_now);
			break; /* breaks to after catch_oob_access_end() only! */
		}
		memset(bm, c, do_now * sizeof(long));
		bm_unmap(p_addr);
		offset += do_now;
	}
}
/*
 * make sure the bitmap has enough room for the attached storage,
 * if necessary, resize.
 * called whenever we may have changed the device size.
 * returns -ENOMEM if we could not allocate enough memory, 0 on success.
 * In case this is actually a resize, we copy the old bitmap into the new one.
 * Otherwise, the bitmap is initialized to all bits set.
 */
int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
{
	struct drbd_bitmap *b = mdev->bitmap;
	unsigned long bits, words, owords, obits, *p_addr, *bm;
	unsigned long want, have, onpages; /* number of pages */
	struct page **npages, **opages = NULL;
	int err = 0, growing;
	int opages_vmalloced;

	ERR_IF(!b) return -ENOMEM;

	drbd_bm_lock(mdev, "resize");

	dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
			(unsigned long long)capacity);

	if (capacity == b->bm_dev_capacity)
		goto out;

	opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags);

	if (capacity == 0) {
		spin_lock_irq(&b->bm_lock);
		opages = b->bm_pages;
		onpages = b->bm_number_of_pages;
		owords = b->bm_words;
		b->bm_pages = NULL;
		b->bm_number_of_pages =
		b->bm_set   =
		b->bm_bits  =
		b->bm_words =
		b->bm_dev_capacity = 0;
		spin_unlock_irq(&b->bm_lock);
		bm_free_pages(opages, onpages);
		bm_vk_free(opages, opages_vmalloced);
		goto out;
	}

	bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));

	/* if we would use
	   words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
	   a 32bit host could present the wrong number of words
	   to a 64bit host.
	*/
	words = ALIGN(bits, 64) >> LN2_BPL;
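	/* Hypothetical numbers: for bits == 96, ALIGN(bits, BITS_PER_LONG)
	 * would yield 96 bits (12 bytes) on a 32bit host but 128 bits
	 * (16 bytes) on a 64bit host; ALIGN(bits, 64) yields 16 bytes,
	 * i.e. the same number of words worth of bytes, on both. */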
	if (get_ldev(mdev)) {
		D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12));
		put_ldev(mdev);
	}

	/* one extra long to catch off by one errors */
	want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
	have = b->bm_number_of_pages;
	if (want == have) {
		D_ASSERT(b->bm_pages != NULL);
		npages = b->bm_pages;
	} else {
		if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC))
			npages = NULL;
		else
			npages = bm_realloc_pages(b, want);
	}

	if (!npages) {
		err = -ENOMEM;
		goto out;
	}

	spin_lock_irq(&b->bm_lock);
	opages = b->bm_pages;
	owords = b->bm_words;
	obits  = b->bm_bits;

	growing = bits > obits;
	if (opages)
		bm_set_surplus(b);

	b->bm_pages = npages;
	b->bm_number_of_pages = want;
	b->bm_bits  = bits;
	b->bm_words = words;
	b->bm_dev_capacity = capacity;

	if (growing) {
		bm_memset(b, owords, 0xff, words-owords);
		b->bm_set += bits - obits;
	}

	if (want < have) {
		/* implicit: (opages != NULL) && (opages != npages) */
		bm_free_pages(opages + want, have - want);
	}

	p_addr = bm_map_paddr(b, words);
	bm = p_addr + MLPP(words);
	*bm = DRBD_MAGIC;
	bm_unmap(p_addr);

	(void)bm_clear_surplus(b);

	spin_unlock_irq(&b->bm_lock);
	if (opages != npages)
		bm_vk_free(opages, opages_vmalloced);
	if (!growing)
		b->bm_set = bm_count_bits(b);
	dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words);

 out:
	drbd_bm_unlock(mdev);
	return err;
}
/* inherently racy:
 * if not protected by other means, return value may be out of date when
 * leaving this function...
 * we still need to lock it, since it is important that this returns
 * bm_set == 0 precisely.
 *
 * maybe bm_set should be atomic_t ?
 */
static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
{
	struct drbd_bitmap *b = mdev->bitmap;
	unsigned long s;
	unsigned long flags;

	ERR_IF(!b) return 0;
	ERR_IF(!b->bm_pages) return 0;

	spin_lock_irqsave(&b->bm_lock, flags);
	s = b->bm_set;
	spin_unlock_irqrestore(&b->bm_lock, flags);

	return s;
}
unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
{
	unsigned long s;
	/* if I don't have a disk, I don't know about out-of-sync status */
	if (!get_ldev_if_state(mdev, D_NEGOTIATING))
		return 0;
	s = _drbd_bm_total_weight(mdev);
	put_ldev(mdev);
	return s;
}
size_t drbd_bm_words(struct drbd_conf *mdev)
{
	struct drbd_bitmap *b = mdev->bitmap;
	ERR_IF(!b) return 0;
	ERR_IF(!b->bm_pages) return 0;

	return b->bm_words;
}
unsigned long drbd_bm_bits(struct drbd_conf *mdev)
{
	struct drbd_bitmap *b = mdev->bitmap;
	ERR_IF(!b) return 0;

	return b->bm_bits;
}
/* merge number words from buffer into the bitmap starting at offset.
 * buffer[i] is expected to be little endian unsigned long.
 * bitmap must be locked by drbd_bm_lock.
 * currently only used from receive_bitmap.
 */
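/* Example of the accounting done below: merging a buffer word 0x0c into a
 * bitmap word that currently reads 0x0a yields 0x0e, and bm_set grows by
 * hweight_long(0x0e) - hweight_long(0x0a) == 3 - 2 == 1 newly set bit. */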
void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
			unsigned long *buffer)
{
	struct drbd_bitmap *b = mdev->bitmap;
	unsigned long *p_addr, *bm;
	unsigned long word, bits;
	size_t end, do_now;

	end = offset + number;

	ERR_IF(!b) return;
	ERR_IF(!b->bm_pages) return;
	if (number == 0)
		return;
	WARN_ON(offset >= b->bm_words);
	WARN_ON(end    >  b->bm_words);

	spin_lock_irq(&b->bm_lock);
	while (offset < end) {
		do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
		p_addr = bm_map_paddr(b, offset);
		bm = p_addr + MLPP(offset);
		offset += do_now;
		while (do_now--) {
			bits = hweight_long(*bm);
			word = *bm | lel_to_cpu(*buffer++);
			*bm++ = word;
			b->bm_set += hweight_long(word) - bits;
		}
		bm_unmap(p_addr);
	}
	/* with 32bit <-> 64bit cross-platform connect
	 * this is only correct for current usage,
	 * where we _know_ that we are 64 bit aligned,
	 * and know that this function is used in this way, too...
	 */
	if (end == b->bm_words)
		b->bm_set -= bm_clear_surplus(b);

	spin_unlock_irq(&b->bm_lock);
}
/* copy number words from the bitmap starting at offset into the buffer.
 * buffer[i] will be little endian unsigned long.
 */
void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
		     unsigned long *buffer)
{
	struct drbd_bitmap *b = mdev->bitmap;
	unsigned long *p_addr, *bm;
	size_t end, do_now;

	end = offset + number;

	ERR_IF(!b) return;
	ERR_IF(!b->bm_pages) return;

	spin_lock_irq(&b->bm_lock);
	if ((offset >= b->bm_words) ||
	    (end    >  b->bm_words) ||
	    (number <= 0))
		dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",
			(unsigned long) offset,
			(unsigned long) number,
			(unsigned long) b->bm_words);
	else {
		while (offset < end) {
			do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
			p_addr = bm_map_paddr(b, offset);
			bm = p_addr + MLPP(offset);
			offset += do_now;
			while (do_now--)
				*buffer++ = cpu_to_lel(*bm++);
			bm_unmap(p_addr);
		}
	}
	spin_unlock_irq(&b->bm_lock);
}
/* set all bits in the bitmap */
void drbd_bm_set_all(struct drbd_conf *mdev)
{
	struct drbd_bitmap *b = mdev->bitmap;
	ERR_IF(!b) return;
	ERR_IF(!b->bm_pages) return;

	spin_lock_irq(&b->bm_lock);
	bm_memset(b, 0, 0xff, b->bm_words);
	(void)bm_clear_surplus(b);
	b->bm_set = b->bm_bits;
	spin_unlock_irq(&b->bm_lock);
}
/* clear all bits in the bitmap */
void drbd_bm_clear_all(struct drbd_conf *mdev)
{
	struct drbd_bitmap *b = mdev->bitmap;
	ERR_IF(!b) return;
	ERR_IF(!b->bm_pages) return;

	spin_lock_irq(&b->bm_lock);
	bm_memset(b, 0, 0, b->bm_words);
	b->bm_set = 0;
	spin_unlock_irq(&b->bm_lock);
}
static void bm_async_io_complete(struct bio *bio, int error)
{
	struct drbd_bitmap *b = bio->bi_private;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);

	/* strange behavior of some lower level drivers...
	 * fail the request by clearing the uptodate flag,
	 * but do not return any error?!
	 * do we want to WARN() on this? */
	if (!error && !uptodate)
		error = -EIO;

	if (error) {
		/* for now, set all bits, and flag MD_IO_ERROR */
		__set_bit(BM_MD_IO_ERROR, &b->bm_flags);
	}

	if (atomic_dec_and_test(&b->bm_async_io))
		wake_up(&b->bm_io_wait);

	bio_put(bio);
}
static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
{
	/* we are process context. we always get a bio */
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
	unsigned int len;
	sector_t on_disk_sector =
		mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
	on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);

	/* this might happen with very small
	 * flexible external meta data device */
	len = min_t(unsigned int, PAGE_SIZE,
		(drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);

	bio->bi_bdev = mdev->ldev->md_bdev;
	bio->bi_sector = on_disk_sector;
	bio_add_page(bio, b->bm_pages[page_nr], len, 0);
	bio->bi_private = b;
	bio->bi_end_io = bm_async_io_complete;

	if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
		bio->bi_rw |= rw;
		bio_endio(bio, -EIO);
	} else {
		submit_bio(rw, bio);
	}
}
# if defined(__LITTLE_ENDIAN)
	/* nothing to do, on disk == in memory */
# define bm_cpu_to_lel(x) ((void)0)
# else
void bm_cpu_to_lel(struct drbd_bitmap *b)
{
	/* need to cpu_to_lel all the pages ...
	 * this may be optimized by using
	 * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0;
	 * the following is still not optimal, but better than nothing */
	unsigned int i;
	unsigned long *p_addr, *bm;
	if (b->bm_set == 0) {
		/* no page at all; avoid swap if all is 0 */
		i = b->bm_number_of_pages;
	} else if (b->bm_set == b->bm_bits) {
		/* only the last page */
		i = b->bm_number_of_pages - 1;
	} else {
		/* all pages */
		i = 0;
	}
	for (; i < b->bm_number_of_pages; i++) {
		p_addr = kmap_atomic(b->bm_pages[i], KM_USER0);
		for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++)
			*bm = cpu_to_lel(*bm);
		kunmap_atomic(p_addr, KM_USER0);
	}
}
# endif
/* lel_to_cpu == cpu_to_lel */
# define bm_lel_to_cpu(x) bm_cpu_to_lel(x)
/*
 * bm_rw: read/write the whole bitmap from/to its on disk location.
 */
static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
{
	struct drbd_bitmap *b = mdev->bitmap;
	/* sector_t sector; */
	int bm_words, num_pages, i;
	unsigned long now;
	char ppb[10];
	int err = 0;

	WARN_ON(!bm_is_locked(b));

	/* no spinlock here, the drbd_bm_lock should be enough! */

	bm_words  = drbd_bm_words(mdev);
	num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;

	/* on disk bitmap is little endian */
	if (rw == WRITE)
		bm_cpu_to_lel(b);

	now = jiffies;
	atomic_set(&b->bm_async_io, num_pages);
	__clear_bit(BM_MD_IO_ERROR, &b->bm_flags);

	/* let the layers below us try to merge these bios... */
	for (i = 0; i < num_pages; i++)
		bm_page_io_async(mdev, b, i, rw);

	drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
	wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);

	if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
		dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
		drbd_chk_io_error(mdev, 1, TRUE);
		err = -EIO;
	}

	now = jiffies;
	if (rw == WRITE) {
		/* swap back endianness */
		bm_lel_to_cpu(b);
		/* flush bitmap to stable storage */
		drbd_md_flush(mdev);
	} else /* rw == READ */ {
		/* just read, if necessary adjust endianness */
		b->bm_set = bm_count_bits_swap_endian(b);
		dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
			 jiffies - now);
	}
	now = b->bm_set;

	dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
		 ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);

	return err;
}
/**
 * drbd_bm_read() - Read the whole bitmap from its on disk location.
 * @mdev:	DRBD device.
 */
int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
{
	return bm_rw(mdev, READ);
}
/**
 * drbd_bm_write() - Write the whole bitmap to its on disk location.
 * @mdev:	DRBD device.
 */
int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
{
	return bm_rw(mdev, WRITE);
}
/**
 * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap
 * @mdev:	DRBD device.
 * @enr:	Extent number in the resync lru (happens to be sector offset)
 *
 * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
 * by a single sector write. Therefore enr == sector offset from the
 * start of the bitmap.
 */
int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local)
{
	sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
				      + mdev->ldev->md.bm_offset;
	int bm_words, num_words, offset;
	int err = 0;

	mutex_lock(&mdev->md_io_mutex);
	bm_words  = drbd_bm_words(mdev);
	offset    = S2W(enr);	/* word offset into bitmap */
	num_words = min(S2W(1), bm_words - offset);
	if (num_words < S2W(1))
		memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE);
	drbd_bm_get_lel(mdev, offset, num_words,
			page_address(mdev->md_io_page));
	if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) {
		int i;
		err = -EIO;
		dev_err(DEV, "IO ERROR writing bitmap sector %lu "
		    "(meta-disk sector %llus)\n",
		    enr, (unsigned long long)on_disk_sector);
		drbd_chk_io_error(mdev, 1, TRUE);
		for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
			drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i);
	}
	mutex_unlock(&mdev->md_io_mutex);
	return err;
}
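/* Plugging in the usual numbers (BM_BLOCK_SIZE == 4096): one 512 byte bitmap
 * sector holds 512*8 == 4096 bits, each covering 4KB, so a single sector
 * write covers 16 MB of device data, which should be BM_EXT_SIZE. */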
/* NOTE
 * find_first_bit returns int, we return unsigned long.
 * should not make much difference anyways, but ...
 *
 * this returns a bit number, NOT a sector!
 */
#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1)
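/* BPP_MASK masks a bit number down to its offset within one page: assuming
 * PAGE_SHIFT == 12, a page holds 1<<(12+3) == 32768 bits, so
 * BPP_MASK == 0x7fff. */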
static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
	const int find_zero_bit, const enum km_type km)
{
	struct drbd_bitmap *b = mdev->bitmap;
	unsigned long i = -1UL;
	unsigned long *p_addr;
	unsigned long bit_offset; /* bit offset of the mapped page. */

	if (bm_fo > b->bm_bits) {
		dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
	} else {
		while (bm_fo < b->bm_bits) {
			unsigned long offset;
			bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */
			offset = bit_offset >> LN2_BPL; /* word offset of the page */
			p_addr = __bm_map_paddr(b, offset, km);

			if (find_zero_bit)
				i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
			else
				i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);

			__bm_unmap(p_addr, km);
			if (i < PAGE_SIZE*8) {
				i = bit_offset + i;
				if (i >= b->bm_bits)
					break;
				goto found;
			}
			bm_fo = bit_offset + PAGE_SIZE*8;
		}
		i = -1UL;
	}
 found:
	return i;
}
static unsigned long bm_find_next(struct drbd_conf *mdev,
	unsigned long bm_fo, const int find_zero_bit)
{
	struct drbd_bitmap *b = mdev->bitmap;
	unsigned long i = -1UL;

	ERR_IF(!b) return i;
	ERR_IF(!b->bm_pages) return i;

	spin_lock_irq(&b->bm_lock);
	if (bm_is_locked(b))
		bm_print_lock_info(mdev);

	i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);

	spin_unlock_irq(&b->bm_lock);
	return i;
}
unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
{
	return bm_find_next(mdev, bm_fo, 0);
}
#if 0
/* not yet needed for anything. */
unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
{
	return bm_find_next(mdev, bm_fo, 1);
}
#endif
/* does not spin_lock_irqsave.
 * you must take drbd_bm_lock() first */
unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
{
	/* WARN_ON(!bm_is_locked(mdev)); */
	return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
}
unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
{
	/* WARN_ON(!bm_is_locked(mdev)); */
	return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
}
/* returns number of bits actually changed.
 * for val != 0, we change 0 -> 1, return code positive
 * for val == 0, we change 1 -> 0, return code negative
 * wants bitnr, not sector.
 * expected to be called for only a few bits (e - s about BITS_PER_LONG).
 * Must hold bitmap lock already. */
int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
	unsigned long e, int val, const enum km_type km)
{
	struct drbd_bitmap *b = mdev->bitmap;
	unsigned long *p_addr = NULL;
	unsigned long bitnr;
	unsigned long last_page_nr = -1UL;
	int c = 0;

	if (e >= b->bm_bits) {
		dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
				s, e, b->bm_bits);
		e = b->bm_bits ? b->bm_bits - 1 : 0;
	}
	for (bitnr = s; bitnr <= e; bitnr++) {
		unsigned long offset = bitnr>>LN2_BPL;
		unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
		if (page_nr != last_page_nr) {
			if (p_addr)
				__bm_unmap(p_addr, km);
			p_addr = __bm_map_paddr(b, offset, km);
			last_page_nr = page_nr;
		}

		if (val)
			c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr));
		else
			c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr));
	}
	if (p_addr)
		__bm_unmap(p_addr, km);
	b->bm_set += c;
	return c;
}
/* returns number of bits actually changed.
 * for val != 0, we change 0 -> 1, return code positive
 * for val == 0, we change 1 -> 0, return code negative
 * wants bitnr, not sector */
int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
	const unsigned long e, int val)
{
	unsigned long flags;
	struct drbd_bitmap *b = mdev->bitmap;
	int c = 0;

	ERR_IF(!b) return 1;
	ERR_IF(!b->bm_pages) return 0;

	spin_lock_irqsave(&b->bm_lock, flags);
	if (bm_is_locked(b))
		bm_print_lock_info(mdev);

	c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1);

	spin_unlock_irqrestore(&b->bm_lock, flags);
	return c;
}
/* returns number of bits changed 0 -> 1 */
int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
	return bm_change_bits_to(mdev, s, e, 1);
}
/* returns number of bits changed 1 -> 0 */
int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
	return -bm_change_bits_to(mdev, s, e, 0);
}
/* sets all bits in full words,
 * from first_word up to, but not including, last_word */
static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
		int page_nr, int first_word, int last_word)
{
	int i;
	int bits;
	unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0);
	for (i = first_word; i < last_word; i++) {
		bits = hweight_long(paddr[i]);
		paddr[i] = ~0UL;
		b->bm_set += BITS_PER_LONG - bits;
	}
	kunmap_atomic(paddr, KM_USER0);
}
/* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave.
 * You must first drbd_bm_lock().
 * Can be called to set the whole bitmap in one go.
 * Sets bits from s to e _inclusive_. */
void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
	/* First set_bit from the first bit (s)
	 * up to the next long boundary (sl),
	 * then assign full words up to the last long boundary (el),
	 * then set_bit up to and including the last bit (e).
	 *
	 * Do not use memset, because we must account for changes,
	 * so we need to loop over the words with hweight() anyways.
	 */
	unsigned long sl = ALIGN(s, BITS_PER_LONG);
	unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
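	/* e.g. s == 5, e == 1000 with BITS_PER_LONG == 64: sl == 64 and
	 * el == 960, so bits 5..63 and 960..1000 are set one by one below,
	 * while bits 64..959 are set a full word at a time. */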
	int first_page;
	int last_page;
	int page_nr;
	int first_word;
	int last_word;

	if (e - s <= 3*BITS_PER_LONG) {
		/* don't bother; el and sl may even be wrong. */
		__bm_change_bits_to(mdev, s, e, 1, KM_USER0);
		return;
	}

	/* difference is large enough that we can trust sl and el */

	/* bits filling the current long */
	if (sl)
		__bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0);

	first_page = sl >> (3 + PAGE_SHIFT);
	last_page = el >> (3 + PAGE_SHIFT);

	/* MLPP: modulo longs per page */
	/* LWPP: long words per page */
	first_word = MLPP(sl >> LN2_BPL);
	last_word = LWPP;

	/* first and full pages, unless first page == last page */
	for (page_nr = first_page; page_nr < last_page; page_nr++) {
		bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
		first_word = 0;
	}

	/* last page (respectively only page, for first page == last page) */
	last_word = MLPP(el >> LN2_BPL);
	bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);

	/* possibly trailing bits.
	 * example: (e & 63) == 63, el will be e+1.
	 * if that even was the very last bit,
	 * it would trigger an assert in __bm_change_bits_to()
	 */
	if (el <= e)
		__bm_change_bits_to(mdev, el, e, 1, KM_USER0);
}
/* returns bit state
 * wants bitnr, NOT sector.
 * inherently racy... area needs to be locked by means of {al,rs}_lru
 *  1 ... bit set
 *  0 ... bit not set
 * -1 ... first out of bounds access, stop testing for bits!
 */
int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
{
	unsigned long flags;
	struct drbd_bitmap *b = mdev->bitmap;
	unsigned long *p_addr;
	int i;

	ERR_IF(!b) return 0;
	ERR_IF(!b->bm_pages) return 0;

	spin_lock_irqsave(&b->bm_lock, flags);
	if (bm_is_locked(b))
		bm_print_lock_info(mdev);
	if (bitnr < b->bm_bits) {
		unsigned long offset = bitnr>>LN2_BPL;
		p_addr = bm_map_paddr(b, offset);
		i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
		bm_unmap(p_addr);
	} else if (bitnr == b->bm_bits) {
		i = -1;
	} else { /* (bitnr > b->bm_bits) */
		dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
		i = 0;
	}

	spin_unlock_irqrestore(&b->bm_lock, flags);
	return i;
}
/* returns number of bits set in the range [s, e] */
int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
	unsigned long flags;
	struct drbd_bitmap *b = mdev->bitmap;
	unsigned long *p_addr = NULL, page_nr = -1;
	unsigned long bitnr;
	int c = 0;
	size_t w;

	/* If this is called without a bitmap, that is a bug.  But just to be
	 * robust in case we screwed up elsewhere, in that case pretend there
	 * was one dirty bit in the requested area, so we won't try to do a
	 * local read there (no bitmap probably implies no disk) */
	ERR_IF(!b) return 1;
	ERR_IF(!b->bm_pages) return 1;

	spin_lock_irqsave(&b->bm_lock, flags);
	if (bm_is_locked(b))
		bm_print_lock_info(mdev);
	for (bitnr = s; bitnr <= e; bitnr++) {
		w = bitnr >> LN2_BPL;
		if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) {
			page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3);
			if (p_addr)
				bm_unmap(p_addr);
			p_addr = bm_map_paddr(b, w);
		}
		ERR_IF (bitnr >= b->bm_bits) {
			dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
		} else {
			c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
		}
	}
	if (p_addr)
		bm_unmap(p_addr);
	spin_unlock_irqrestore(&b->bm_lock, flags);
	return c;
}
/* inherently racy...
 * return value may be already out-of-date when this function returns.
 * but the general usage is that this is only used during a cstate when bits
 * are only cleared, not set, and we typically only care for the case when
 * the return value is zero, or we already "locked" this "bitmap extent" by
 * other means.
 *
 * enr is bm-extent number, since we chose to name one sector (512 bytes)
 * worth of the bitmap a "bitmap extent".
 *
 * TODO
 * I think since we use it like a reference count, we should use the real
 * reference count of some bitmap extent element from some lru instead...
 */
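/* Units sanity for the function below, assuming 64-bit longs: one bitmap
 * extent is one 512 byte sector of bitmap == S2W(1) == 64 long words
 * == 4096 bits, so the returned weight lies in 0..4096. */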
int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
{
	struct drbd_bitmap *b = mdev->bitmap;
	int count, s, e;
	unsigned long flags;
	unsigned long *p_addr, *bm;

	ERR_IF(!b) return 0;
	ERR_IF(!b->bm_pages) return 0;

	spin_lock_irqsave(&b->bm_lock, flags);
	if (bm_is_locked(b))
		bm_print_lock_info(mdev);

	s = S2W(enr);
	e = min((size_t)S2W(enr+1), b->bm_words);
	count = 0;
	if (s < b->bm_words) {
		int n = e-s;
		p_addr = bm_map_paddr(b, s);
		bm = p_addr + MLPP(s);
		while (n--)
			count += hweight_long(*bm++);
		bm_unmap(p_addr);
	} else {
		dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);
	}
	spin_unlock_irqrestore(&b->bm_lock, flags);
	return count;
}
/* set all bits covered by the AL-extent al_enr */
unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
{
	struct drbd_bitmap *b = mdev->bitmap;
	unsigned long *p_addr, *bm;
	unsigned long weight;
	int count, s, e, i, do_now;
	ERR_IF(!b) return 0;
	ERR_IF(!b->bm_pages) return 0;

	spin_lock_irq(&b->bm_lock);
	if (bm_is_locked(b))
		bm_print_lock_info(mdev);
	weight = b->bm_set;

	s = al_enr * BM_WORDS_PER_AL_EXT;
	e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
	/* assert that s and e are on the same page */
	D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
	      ==  s    >> (PAGE_SHIFT - LN2_BPL + 3));
	count = 0;
	if (s < b->bm_words) {
		i = do_now = e-s;
		p_addr = bm_map_paddr(b, s);
		bm = p_addr + MLPP(s);
		while (i--) {
			count += hweight_long(*bm);
			*bm = -1UL;
			bm++;
		}
		bm_unmap(p_addr);
		b->bm_set += do_now*BITS_PER_LONG - count;
		if (e == b->bm_words)
			b->bm_set -= bm_clear_surplus(b);
	} else {
		dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s);
	}
	weight = b->bm_set - weight;
	spin_unlock_irq(&b->bm_lock);
	return weight;
}