1 # --- T2-COPYRIGHT-NOTE-BEGIN ---
2 # T2 SDE: package/*/linux/up-next-md-bitmap.patch
3 # Copyright (C) 2024 The T2 SDE Project
5 # This Copyright note is generated by scripts/Create-CopyPatch,
6 # more information can be found in the files COPYING and README.
8 # This patch file is dual-licensed. It is available under the license the
9 # patched project is licensed under, as long as it is an OpenSource license
10 # as defined at http://www.opensource.org/ (e.g. BSD, X11) or under the terms
11 # of the GNU General Public License version 2 as used by the T2 SDE.
12 # --- T2-COPYRIGHT-NOTE-END ---
14 diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
15 index 29da10e6f703..349e8421799d 100644
16 --- a/drivers/md/md-bitmap.c
17 +++ b/drivers/md/md-bitmap.c
18 @@ -149,7 +149,18 @@ struct bitmap_page {
21 struct bitmap_counts {
24 + * Customize different types of lock structures to manage
25 + * data and metadata.
26 + * Split the counter table into multiple segments and assign a
27 + * dedicated lock to each segment. The counters in the counter
28 + * table, which map to neighboring stripe blocks, are interleaved
29 + * across different segments.
30 + * CPU threads that target different segments can acquire the locks
31 + * simultaneously, resulting in better thread-level parallelism.
33 + rwlock_t mlock; /* lock for metadata */
34 + spinlock_t *bmclocks; /* locks for bmc */
35 struct bitmap_page *bp;
36 /* total number of pages in the bitmap */
38 @@ -246,10 +257,12 @@ static bool bitmap_enabled(struct mddev *mddev)
39 * if we find our page, we increment the page's refcount so that it stays
40 * allocated while we're using it
42 -static int md_bitmap_checkpage(struct bitmap_counts *bitmap,
43 - unsigned long page, int create, int no_hijack)
44 -__releases(bitmap->lock)
45 -__acquires(bitmap->lock)
46 +static int md_bitmap_checkpage(struct bitmap_counts *bitmap, unsigned long page,
47 + int create, int no_hijack, spinlock_t *bmclock)
50 +__releases(bitmap->mlock)
51 +__acquires(bitmap->mlock)
53 unsigned char *mappage;
55 @@ -264,8 +277,10 @@ __acquires(bitmap->lock)
58 /* this page has not been allocated yet */
60 - spin_unlock_irq(&bitmap->lock);
62 + spin_unlock_irq(bmclock); /* lock for bmc */
64 + write_unlock_irq(&bitmap->mlock); /* lock for metadata */
65 /* It is possible that this is being called inside a
66 * prepare_to_wait/finish_wait loop from raid5c:make_request().
67 * In general it is not permitted to sleep in that context as it
68 @@ -280,7 +295,11 @@ __acquires(bitmap->lock)
70 sched_annotate_sleep();
71 mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
72 - spin_lock_irq(&bitmap->lock);
75 + spin_lock_irq(bmclock); /* lock for bmc */
77 + write_lock_irq(&bitmap->mlock); /* lock for metadata */
79 if (mappage == NULL) {
80 pr_debug("md/bitmap: map page allocation failed, hijacking\n");
81 @@ -1456,16 +1475,35 @@ static void bitmap_write_all(struct mddev *mddev)
82 static void md_bitmap_count_page(struct bitmap_counts *bitmap,
83 sector_t offset, int inc)
85 - sector_t chunk = offset >> bitmap->chunkshift;
86 - unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
88 + * The stripe heads are spread across different locations in the
89 + * SSDs via a configurable hash function rather than mapping to a
90 + * contiguous SSD space.
91 + * Sequential write requests are shuffled to different counters to
92 + * reduce counter preemption.
94 + sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
95 + sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
96 + unsigned long bits = totblocks ? fls((totblocks - 1)) : 0;
97 + unsigned long mask = ULONG_MAX << bits | ~(ULONG_MAX <<
98 + (bits - (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
99 + unsigned long cntid = blockno & mask;
100 + unsigned long page = cntid >> PAGE_COUNTER_SHIFT;
102 bitmap->bp[page].count += inc;
103 md_bitmap_checkfree(bitmap, page);
106 static void md_bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
108 - sector_t chunk = offset >> bitmap->chunkshift;
109 - unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
110 + sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
111 + sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
112 + unsigned long bits = totblocks ? fls((totblocks - 1)) : 0;
113 + unsigned long mask = ULONG_MAX << bits | ~(ULONG_MAX <<
114 + (bits - (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
115 + unsigned long cntid = blockno & mask;
116 + unsigned long page = cntid >> PAGE_COUNTER_SHIFT;
118 struct bitmap_page *bp = &bitmap->bp[page];
121 @@ -1474,7 +1512,7 @@ static void md_bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
123 static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
124 sector_t offset, sector_t *blocks,
126 + int create, spinlock_t *bmclock);
128 static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout,
130 @@ -1557,7 +1595,7 @@ static void bitmap_daemon_work(struct mddev *mddev)
131 * decrement and handle accordingly.
133 counts = &bitmap->counts;
134 - spin_lock_irq(&counts->lock);
135 + write_lock_irq(&counts->mlock);
137 for (j = 0; j < counts->chunks; j++) {
138 bitmap_counter_t *bmc;
139 @@ -1572,7 +1610,7 @@ static void bitmap_daemon_work(struct mddev *mddev)
140 counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0;
143 - bmc = md_bitmap_get_counter(counts, block, &blocks, 0);
144 + bmc = md_bitmap_get_counter(counts, block, &blocks, 0, NULL);
146 j |= PAGE_COUNTER_MASK;
148 @@ -1588,7 +1626,7 @@ static void bitmap_daemon_work(struct mddev *mddev)
149 bitmap->allclean = 0;
152 - spin_unlock_irq(&counts->lock);
153 + write_unlock_irq(&counts->mlock);
155 md_bitmap_wait_writes(bitmap);
156 /* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
157 @@ -1621,17 +1659,25 @@ static void bitmap_daemon_work(struct mddev *mddev)
159 static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
160 sector_t offset, sector_t *blocks,
162 -__releases(bitmap->lock)
163 -__acquires(bitmap->lock)
164 + int create, spinlock_t *bmclock)
167 +__releases(bitmap->mlock)
168 +__acquires(bitmap->mlock)
170 /* If 'create', we might release the lock and reclaim it.
171 * The lock must have been taken with interrupts enabled.
172 * If !create, we don't release the lock.
174 - sector_t chunk = offset >> bitmap->chunkshift;
175 - unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
176 - unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
177 + sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
178 + sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
179 + unsigned long bits = totblocks ? fls((totblocks - 1)) : 0;
180 + unsigned long mask = ULONG_MAX << bits | ~(ULONG_MAX <<
181 + (bits - (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
182 + unsigned long cntid = blockno & mask;
183 + unsigned long page = cntid >> PAGE_COUNTER_SHIFT;
184 + unsigned long pageoff = (cntid & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
186 sector_t csize = ((sector_t)1) << bitmap->chunkshift;
189 @@ -1644,7 +1690,7 @@ __acquires(bitmap->lock)
190 *blocks = csize - (offset & (csize - 1));
193 - err = md_bitmap_checkpage(bitmap, page, create, 0);
194 + err = md_bitmap_checkpage(bitmap, page, create, 0, bmclock);
196 if (bitmap->bp[page].hijacked ||
197 bitmap->bp[page].map == NULL)
198 @@ -1669,6 +1715,28 @@ __acquires(bitmap->lock)
199 &(bitmap->bp[page].map[pageoff]);
202 +/* set-association */
203 +static spinlock_t *md_bitmap_get_bmclock(struct bitmap_counts *bitmap, sector_t offset);
205 +static spinlock_t *md_bitmap_get_bmclock(struct bitmap_counts *bitmap, sector_t offset)
207 + sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
208 + sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
209 + unsigned long bitscnt = totblocks ? fls((totblocks - 1)) : 0;
210 + unsigned long maskcnt = ULONG_MAX << bitscnt | ~(ULONG_MAX << (bitscnt -
211 + (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
212 + unsigned long cntid = blockno & maskcnt;
214 + unsigned long totcnts = bitmap->chunks;
215 + unsigned long bitslock = totcnts ? fls((totcnts - 1)) : 0;
216 + unsigned long masklock = ULONG_MAX << bitslock | ~(ULONG_MAX <<
217 + (bitslock - BITMAP_COUNTER_LOCK_RATIO_SHIFT));
218 + unsigned long lockid = cntid & masklock;
220 + spinlock_t *bmclock = &(bitmap->bmclocks[lockid]);
224 static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
225 unsigned long sectors, bool behind)
227 @@ -1691,11 +1759,15 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
230 bitmap_counter_t *bmc;
231 + spinlock_t *bmclock;
233 - spin_lock_irq(&bitmap->counts.lock);
234 - bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1);
235 + bmclock = md_bitmap_get_bmclock(&bitmap->counts, offset);
236 + read_lock(&bitmap->counts.mlock);
237 + spin_lock_irq(bmclock);
238 + bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1, bmclock);
240 - spin_unlock_irq(&bitmap->counts.lock);
241 + spin_unlock_irq(bmclock);
242 + read_unlock(&bitmap->counts.mlock);
246 @@ -1707,7 +1779,8 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
248 prepare_to_wait(&bitmap->overflow_wait, &__wait,
249 TASK_UNINTERRUPTIBLE);
250 - spin_unlock_irq(&bitmap->counts.lock);
251 + spin_unlock_irq(bmclock);
252 + read_unlock(&bitmap->counts.mlock);
254 finish_wait(&bitmap->overflow_wait, &__wait);
256 @@ -1724,7 +1797,8 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
260 - spin_unlock_irq(&bitmap->counts.lock);
261 + spin_unlock_irq(bmclock);
262 + read_unlock(&bitmap->counts.mlock);
265 if (sectors > blocks)
266 @@ -1755,11 +1829,15 @@ static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
269 bitmap_counter_t *bmc;
270 + spinlock_t *bmclock;
272 - spin_lock_irqsave(&bitmap->counts.lock, flags);
273 - bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 0);
274 + bmclock = md_bitmap_get_bmclock(&bitmap->counts, offset);
275 + read_lock(&bitmap->counts.mlock);
276 + spin_lock_irqsave(bmclock, flags);
277 + bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 0, bmclock);
279 - spin_unlock_irqrestore(&bitmap->counts.lock, flags);
280 + spin_unlock_irqrestore(bmclock, flags);
281 + read_unlock(&bitmap->counts.mlock);
285 @@ -1781,7 +1859,8 @@ static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
286 md_bitmap_set_pending(&bitmap->counts, offset);
287 bitmap->allclean = 0;
289 - spin_unlock_irqrestore(&bitmap->counts.lock, flags);
290 + spin_unlock_irqrestore(bmclock, flags);
291 + read_unlock(&bitmap->counts.mlock);
293 if (sectors > blocks)
295 @@ -1794,16 +1873,18 @@ static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset,
296 sector_t *blocks, bool degraded)
298 bitmap_counter_t *bmc;
300 + spinlock_t *bmclock;
303 if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
305 return true; /* always resync if no bitmap */
307 - spin_lock_irq(&bitmap->counts.lock);
310 - bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
311 + bmclock = md_bitmap_get_bmclock(&bitmap->counts, offset);
312 + read_lock(&bitmap->counts.mlock);
313 + spin_lock_irq(bmclock);
314 + bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0, bmclock);
318 @@ -1816,7 +1897,8 @@ static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset,
322 - spin_unlock_irq(&bitmap->counts.lock);
323 + spin_unlock_irq(bmclock);
324 + read_unlock(&bitmap->counts.mlock);
328 @@ -1850,13 +1932,16 @@ static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset,
330 bitmap_counter_t *bmc;
332 + spinlock_t *bmclock;
334 if (bitmap == NULL) {
338 - spin_lock_irqsave(&bitmap->counts.lock, flags);
339 - bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
340 + bmclock = md_bitmap_get_bmclock(&bitmap->counts, offset);
341 + read_lock(&bitmap->counts.mlock);
342 + spin_lock_irqsave(bmclock, flags);
343 + bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0, bmclock);
347 @@ -1873,7 +1958,8 @@ static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset,
351 - spin_unlock_irqrestore(&bitmap->counts.lock, flags);
352 + spin_unlock_irqrestore(bmclock, flags);
353 + read_unlock(&bitmap->counts.mlock);
356 static void bitmap_end_sync(struct mddev *mddev, sector_t offset,
357 @@ -1961,10 +2047,15 @@ static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, in
360 bitmap_counter_t *bmc;
361 - spin_lock_irq(&bitmap->counts.lock);
362 - bmc = md_bitmap_get_counter(&bitmap->counts, offset, &secs, 1);
363 + spinlock_t *bmclock;
365 + bmclock = md_bitmap_get_bmclock(&bitmap->counts, offset);
366 + read_lock(&bitmap->counts.mlock);
367 + spin_lock_irq(bmclock);
368 + bmc = md_bitmap_get_counter(&bitmap->counts, offset, &secs, 1, bmclock);
370 - spin_unlock_irq(&bitmap->counts.lock);
371 + spin_unlock_irq(bmclock);
372 + read_unlock(&bitmap->counts.mlock);
376 @@ -1975,7 +2066,8 @@ static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, in
380 - spin_unlock_irq(&bitmap->counts.lock);
381 + spin_unlock_irq(bmclock);
382 + read_unlock(&bitmap->counts.mlock);
385 /* dirty the memory and file bits for bitmap chunks "s" to "e" */
386 @@ -2030,6 +2122,7 @@ static void md_bitmap_free(void *data)
387 unsigned long k, pages;
388 struct bitmap_page *bp;
389 struct bitmap *bitmap = data;
390 + spinlock_t *bmclocks;
392 if (!bitmap) /* there was no bitmap */
394 @@ -2050,6 +2143,7 @@ static void md_bitmap_free(void *data)
396 bp = bitmap->counts.bp;
397 pages = bitmap->counts.pages;
398 + bmclocks = bitmap->counts.bmclocks;
400 /* free all allocated memory */
402 @@ -2058,6 +2152,7 @@ static void md_bitmap_free(void *data)
403 if (bp[k].map && !bp[k].hijacked)
410 @@ -2123,7 +2218,9 @@ static struct bitmap *__bitmap_create(struct mddev *mddev, int slot)
412 return ERR_PTR(-ENOMEM);
414 - spin_lock_init(&bitmap->counts.lock);
415 + /* initialize metadata lock */
416 + rwlock_init(&bitmap->counts.mlock);
418 atomic_set(&bitmap->pending_writes, 0);
419 init_waitqueue_head(&bitmap->write_wait);
420 init_waitqueue_head(&bitmap->overflow_wait);
421 @@ -2382,6 +2479,8 @@ static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
424 struct bitmap_page *new_bp;
425 + spinlock_t *new_bmclocks;
426 + int num_bmclocks, i;
428 if (bitmap->storage.file && !init) {
429 pr_info("md: cannot resize file-based bitmap\n");
430 @@ -2450,7 +2549,7 @@ static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
431 memcpy(page_address(store.sb_page),
432 page_address(bitmap->storage.sb_page),
433 sizeof(bitmap_super_t));
434 - spin_lock_irq(&bitmap->counts.lock);
435 + write_lock_irq(&bitmap->counts.mlock);
436 md_bitmap_file_unmap(&bitmap->storage);
437 bitmap->storage = store;
439 @@ -2466,11 +2565,23 @@ static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
440 blocks = min(old_counts.chunks << old_counts.chunkshift,
441 chunks << chunkshift);
443 + /* initialize bmc locks */
444 + num_bmclocks = DIV_ROUND_UP(chunks, BITMAP_COUNTER_LOCK_RATIO);
445 + num_bmclocks = min(num_bmclocks, BITMAP_COUNTER_LOCK_MAX);
447 + new_bmclocks = kvcalloc(num_bmclocks, sizeof(*new_bmclocks), GFP_KERNEL);
448 + bitmap->counts.bmclocks = new_bmclocks;
449 + for (i = 0; i < num_bmclocks; ++i) {
450 + spinlock_t *bmclock = &(bitmap->counts.bmclocks)[i];
452 + spin_lock_init(bmclock);
455 /* For cluster raid, need to pre-allocate bitmap */
456 if (mddev_is_clustered(bitmap->mddev)) {
458 for (page = 0; page < pages; page++) {
459 - ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1);
460 + ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1, NULL);
464 @@ -2500,11 +2611,12 @@ static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
465 bitmap_counter_t *bmc_old, *bmc_new;
468 - bmc_old = md_bitmap_get_counter(&old_counts, block, &old_blocks, 0);
469 + bmc_old = md_bitmap_get_counter(&old_counts, block, &old_blocks, 0, NULL);
470 set = bmc_old && NEEDED(*bmc_old);
473 - bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
474 + bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks,
478 /* need to set on-disk bits too. */
479 @@ -2540,7 +2652,7 @@ static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
481 while (block < (chunks << chunkshift)) {
482 bitmap_counter_t *bmc;
483 - bmc = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
484 + bmc = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1, NULL);
486 /* new space. It needs to be resynced, so
487 * we set NEEDED_MASK.
488 @@ -2556,7 +2668,7 @@ static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
489 for (i = 0; i < bitmap->storage.file_pages; i++)
490 set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
492 - spin_unlock_irq(&bitmap->counts.lock);
493 + write_unlock_irq(&bitmap->counts.mlock);
496 __bitmap_unplug(bitmap);
497 diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
498 index 662e6fc141a7..74dce7f5b34e 100644
499 --- a/drivers/md/md-bitmap.h
500 +++ b/drivers/md/md-bitmap.h
503 * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
505 - * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
507 + * Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
508 + * Copyright (C) 2022-2023, Shushu Yi (firnyee@gmail.com)
512 @@ -13,6 +15,10 @@ typedef __u16 bitmap_counter_t;
513 #define COUNTER_BITS 16
514 #define COUNTER_BIT_SHIFT 4
515 #define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
516 +/* how many counters share the same bmclock? */
517 +#define BITMAP_COUNTER_LOCK_RATIO_SHIFT 0
518 +#define BITMAP_COUNTER_LOCK_RATIO (1 << BITMAP_COUNTER_LOCK_RATIO_SHIFT)
519 +#define BITMAP_COUNTER_LOCK_MAX 65536
521 #define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
522 #define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))