// SPDX-License-Identifier: GPL-2.0
/*
 * Bad block management
 *
 * - Heavily based on MD badblocks code from Neil Brown
 *
 * Copyright (c) 2015, Intel Corporation.
 */

#include <linux/badblocks.h>
#include <linux/seqlock.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/slab.h>

/*
 * The purpose of badblocks set/clear is to manage bad block ranges which are
 * identified by LBA addresses.
 *
 * When the caller of badblocks_set() wants to set a range of bad blocks, the
 * setting range can be acked or unacked. The setting range may be merged
 * with, may overwrite, or may skip an overlapping already set range,
 * depending on how the ranges overlap or are adjacent, and on the
 * acknowledgment type of the ranges. It can be more complicated when the
 * setting range covers multiple already set bad block ranges, with the
 * restrictions of the maximum length of each bad range and the bad table
 * space limitation.
 *
 * It is difficult and unnecessary to take care of all the possible
 * situations at once. For setting a large range of bad blocks, we handle it
 * by dividing the large range into smaller ones whenever an overlap, the
 * maximum range length, or a full bad table is encountered. Each time only
 * a smaller piece of the bad range is handled, with a limited number of
 * conditions on how it interacts with possibly overlapping or adjacent
 * already set bad block ranges. Then the hard, complicated problem becomes
 * much simpler to handle properly, as the sketch below illustrates.
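 *
 * An illustrative sketch of that loop (handle_one_piece() is a hypothetical
 * name for the per-piece logic, which actually lives inline in
 * _badblocks_set() below):
 *
 *	while (sectors > 0) {
 *		len = handle_one_piece(bb, s, sectors);
 *		s += len;
 *		sectors -= len;
 *	}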
 *
 * When setting a range of bad blocks into the bad table, the simplified
 * situations to be considered are (the already set bad block ranges are
 * named with prefix E, and the setting bad block range is named with
 * prefix S):
 * 1) A setting range is not overlapped or adjacent to any other already set
 *    bad block range.
 *                        +--------+
 *                        |    S   |
 *                        +--------+
 *        +-------------+           +-------------+
 *        |      E1     |           |      E2     |
 *        +-------------+           +-------------+
 *    For this situation, if the bad blocks table is not full, just allocate
 *    a free slot from the bad blocks table to mark the setting range S. The
 *    result is,
 *        +-------------+  +--------+  +-------------+
 *        |      E1     |  |    S   |  |      E2     |
 *        +-------------+  +--------+  +-------------+
 * 2) A setting range starts exactly at the start LBA of an already set bad
 *    block range.
 * 2.1) The setting range size < already set range size
 * 2.1.1) If S and E are both acked or both unacked ranges, the setting
 *    range S can simply be merged into the existing bad range E.
 * 2.1.2) If S is an unacked setting and E is acked, the setting will be
 *    denied, and E is kept unchanged.
 * 2.1.3) If S is an acked setting and E is unacked, range S can overwrite E.
 *    An extra slot from the bad blocks table will be allocated for S, and
 *    the head of E will move to the end of the inserted range S.
 * 2.2) The setting range size == already set range size
 * 2.2.1) If S and E are both acked or both unacked ranges, the setting
 *    range S can simply be merged into the existing bad range E.
 * 2.2.2) If S is an unacked setting and E is acked, the setting will be
 *    denied, and E is kept unchanged.
 * 2.2.3) If S is an acked setting and E is unacked, range S can overwrite
 *    all of bad blocks range E.
 * 2.3) The setting range size > already set range size
 *      +-------------------+
 *      |          S        |
 *      +-------------------+
 *      +-------------+
 *      |       E     |
 *      +-------------+
 *    For such a situation, the setting range S can be treated as two parts:
 *    the first part (S1) has the same size as the already set range E, and
 *    the second part (S2) is the rest of the setting range.
 *      +-------------+-----+        +-------------+       +-----+
 *      |     S1      | S2  |        |     S1      |       |  S2 |
 *      +-------------+-----+ ===>   +-------------+       +-----+
 *      +-------------+              +-------------+
 *      |       E     |              |       E     |
 *      +-------------+              +-------------+
 *    Now we only focus on how to handle the setting range S1 and the
 *    already set range E, which is already explained in 2.2); the rest part
 *    S2 will be handled later, in the next loop.
 * 3) A setting range starts before the start LBA of an already set bad
 *    block range.
 *    For this situation, the setting range S can be divided into two parts:
 *    the first (S1) ends at the start LBA of the already set range E, and
 *    the second part (S2) starts exactly at the start LBA of E.
 *      +----+---------+                +----+   +---------+
 *      | S1 |    S2   |                | S1 |   |    S2   |
 *      +----+---------+      ===>      +----+   +---------+
 *           +-------------+                     +-------------+
 *           |       E     |                     |       E     |
 *           +-------------+                     +-------------+
 *    Now only the first part S1 should be handled in this loop, which is in
 *    a similar condition as 1). The rest part S2 has exactly the same start
 *    LBA as the already set range E, so it will be handled in the next
 *    loop, in one of the situations of 2).
 * 4) A setting range starts after the start LBA of an already set bad block
 *    range.
 * 4.1) If the setting range S exactly matches the tail part of the already
 *    set bad blocks range E:
 * 4.1.1) If ranges S and E have the same acknowledge value (both acked or
 *    both unacked), they will be merged into one.
 * 4.1.2) If range E is acked and the setting range S is unacked, the
 *    setting request of S will be rejected, and E is kept unchanged.
 * 4.1.3) If range E is unacked and the setting range S is acked, then S may
 *    overwrite the overlapped part of E.
 * 4.2) If the setting range S stays in the middle of an already set range E:
 * 4.2.1) If ranges S and E have the same acknowledge value (both acked or
 *    both unacked), they will be merged into one.
 * 4.2.2) If range E is acked and the setting range S is unacked, the
 *    setting request of S will be rejected, and E is kept unchanged.
 * 4.2.3) If range E is unacked and the setting range S is acked, then S
 *    will be inserted into the middle of E, splitting the previous range E
 *    into two parts (E1 and E2).
 * 4.3) If the setting bad blocks range S is overlapped with an already set
 *    bad blocks range E, such that S starts after the start LBA of range E
 *    and ends after the end LBA of range E, as the following chart shows,
 *          +-------------------+
 *          |          S        |
 *          +-------------------+
 *      +-------------+
 *      |       E     |
 *      +-------------+
 *    For this situation the range S can be divided into two parts: the
 *    first part (S1) ends at the end LBA of range E, and the second part
 *    (S2) is the rest of S.
 *      +---------+---------+           +---------+        +---------+
 *      |    S1   |    S2   |           |    S1   |        |    S2   |
 *      +---------+---------+   ===>    +---------+        +---------+
 *    +-------------+                 +-------------+
 *    |       E     |                 |       E     |
 *    +-------------+                 +-------------+
 *    Now in this loop the setting range S1 and the already set range E can
 *    be handled as in situation 4.1); the rest range S2 will be ignored in
 *    this loop and handled in the next one.
 * 5) A setting bad blocks range S is adjacent to one or more already set
 *    bad blocks range(s), and they are all acked or all unacked ranges.
 * 5.1) Front merge: the already set bad blocks range E is before the
 *    setting range S and they are adjacent.
 * 5.1.1) When the total size of ranges S and E <= BB_MAX_LEN, and their
 *    acknowledge values are the same, the setting range S can be front
 *    merged into range E.
 * 5.1.2) Otherwise these two ranges cannot merge; just insert the setting
 *    range S right after the already set range E into the bad blocks table.
 * 6) Special cases which the above conditions cannot handle
 * 6.1) Multiple already set ranges may merge into fewer ones in a full bad
 *    table
 *      +-------------------------------------------------------+
 *      |                            S                          |
 *      +-------------------------------------------------------+
 *      |<----- BB_MAX_LEN ----->|
 *      +-----+     +-----+                              +-----+
 *      | E1  |     | E2  |                              | E3  |
 *      +-----+     +-----+                              +-----+
 *    In the above example, when the bad blocks table is full, inserting the
 *    first part of the setting range S will fail because no more available
 *    slots can be allocated from the bad blocks table. In this situation a
 *    proper setting method should go through all the setting bad block
 *    ranges and look for chances to merge already set ranges into fewer
 *    ones. When there is an available slot in the bad blocks table, retry
 *    to handle as many of the remaining setting bad block ranges as
 *    possible.
 *      +------------------------+
 *      |           S3           |
 *      +------------------------+
 *      |<----- BB_MAX_LEN ----->|
 *      +-----+-----+-----+---+-----+--+
 *      |     |     |     |   |     |  |
 *      +-----+-----+-----+---+-----+--+
 *    The above chart shows that although the first part (S3) cannot be
 *    inserted due to no space in the bad blocks table, the following E1, E2
 *    and E3 ranges can be merged with the rest of S into fewer ranges S1
 *    and S2. Now there is 1 free slot in the bad blocks table.
 *      +------------------------+-----+-----+-----+---+-----+--+
 *      |           S3           |     |     |     |   |     |  |
 *      +------------------------+-----+-----+-----+---+-----+--+
 *    Since the bad blocks table is not full anymore, retry the original
 *    setting range S. Now the setting range S3 can be inserted into the bad
 *    blocks table with the slot freed by the multiple-ranges merge.
 * 6.2) Front merge after overwrite
 *    In the following example, in the bad blocks table, E1 is an acked bad
 *    blocks range and E2 is an unacked bad blocks range, therefore they are
 *    not able to merge into a larger range. The setting bad blocks range S
 *    is acked, therefore part of E2 can be overwritten by S.
 *      +--------+
 *      |    S   |                 S: 1
 *      +--------+
 *    +-------+-------------+      E1: 1
 *    |  E1   |      E2     |      E2: 0
 *    +-------+-------------+
 *    With the previous simplified routines, after overwriting part of E2
 *    with S, the bad blocks table should be (E3 is the remaining part of E2
 *    which is not overwritten by S),
 *    +-------+--------+----+      S: 1
 *    |  E1   |   S    | E3 |      E1: 1
 *    +-------+--------+----+      E3: 0
 *    The above result is correct but not perfect. Ranges E1 and S in the
 *    bad blocks table are both acked; merging them into a larger range may
 *    occupy less bad blocks table space and make badblocks_check() faster.
 *    Therefore in such a situation, after overwriting range S, the previous
 *    range E1 should be checked for a possible front combination. Then the
 *    ideal result is,
 *    +----------------+----+     acknowledged
 *    |       E1       | E3 |      E1: 1
 *    +----------------+----+      E3: 0
 * 6.3) Behind merge: the already set bad blocks range E is behind the
 *    setting range S and they are adjacent. Normally we don't need to care
 *    about this because front merging handles it while going through range
 *    S from head to tail, except for the tail part of range S. When the
 *    setting range S is fully handled, none of the above simplified
 *    routines checks whether the tail LBA of range S is adjacent to the
 *    next already set range, so they will not merge them even when it is
 *    possible.
 *    For the above special situation, when the setting range S is all
 *    handled and the loop ends, an extra check is necessary for whether the
 *    next already set range E is right after S and mergeable.
 * 6.3.1) When the total size of ranges E and S <= BB_MAX_LEN, and their
 *    acknowledge values are the same, the setting range S can be behind
 *    merged into range E.
 * 6.3.2) Otherwise these two ranges cannot merge; just insert the setting
 *    range S in front of the already set range E in the bad blocks table.
 *
 * All the above 5 simplified situations and 3 special cases may cover 99%+
 * of the bad block range setting conditions. Maybe some rare corner case is
 * not considered or optimized, but it won't hurt if badblocks_set() fails
 * due to no space, or if some ranges are not merged to save bad blocks
 * table space.
 *
 * Inside badblocks_set() each loop starts by jumping to the re_insert
 * label; at the start of every loop iteration prev_badblocks() is called to
 * find an already set range which starts before or at the current setting
 * range. Since the setting bad blocks range is handled from head to tail,
 * in most cases it is unnecessary to do the binary search inside
 * prev_badblocks(); it is possible to provide a hint to prev_badblocks()
 * for a fast path, so the expensive binary search can be avoided. In my
 * test, with the hint to prev_badblocks(), except for the first loop, all
 * remaining calls to prev_badblocks() can go into the fast path and return
 * the correct bad blocks table index immediately, as sketched below.
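 *
 * A rough sketch of that hinted-loop pattern (the real loop lives inline in
 * _badblocks_set() below; handle_one_piece() is again a hypothetical name):
 *
 *	int prev, hint = -1;
 *
 * re_insert:
 *	prev = prev_badblocks(bb, &bad, hint);
 *	hint = handle_one_piece(bb, &bad, prev);
 *	if (bad.len > 0)
 *		goto re_insert;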
 *
 * Clearing a bad block range from the bad block table follows a similar
 * idea to setting, but is much simpler. The only thing that needs attention
 * is that when the clearing range hits the middle of a bad block range, the
 * existing bad block range will split into two, and one more item has to be
 * added into the bad block table. The simplified situations to be
 * considered are (the already set bad block ranges in the bad block table
 * are named with prefix E, and the clearing bad block range is named with
 * prefix C):
 * 1) A clearing range is not overlapped with any already set range in the
 *    bad block table.
 *    +-----+                |   +-----+              |          +-----+
 *    |  C  |                |   |  C  |              |          |  C  |
 *    +-----+         or     |   +-----+       or     |          +-----+
 *            +---+          |  +----+  +----+        |  +---+
 *            | E |          |  | E1 |  | E2 |        |  | E |
 *            +---+          |  +----+  +----+        |  +---+
 *    For the above situations, no bad block is cleared and no failure
 *    happens; simply return 0.
 * 2) The clearing range hits the middle of an already set bad blocks range
 *    in the bad block table.
 *            +---+
 *            | C |
 *            +---+
 *    +-----------------+
 *    |         E       |
 *    +-----------------+
 *    In this situation, if the bad block table is not full, the range E
 *    will be split into two ranges E1 and E2.
 * 3) The clearing range starts exactly at the same LBA as an already set
 *    bad block range from the bad block table.
 * 3.1) Partially covered at the head part
 *    +------------+
 *    |      C     |
 *    +------------+
 *    +-----------------+
 *    |         E       |
 *    +-----------------+
 *    For this situation, the overlapped already set range will update its
 *    start LBA to the end of C and shrink its length to
 *    BB_LEN(E) - BB_LEN(C). No item is deleted from the bad block table.
 * 3.2) Exactly fully covered
 *    +-----------------+
 *    |         C       |
 *    +-----------------+
 *    +-----------------+
 *    |         E       |
 *    +-----------------+
 *    For this situation the whole bad blocks range E will be cleared and
 *    its corresponding item is deleted from the bad block table.
 * 4) The clearing range exactly ends at the same LBA as an already set bad
 *    block range.
 *          +-------+
 *          |   C   |
 *          +-------+
 *    +-----------------+
 *    |         E       |
 *    +-----------------+
 *    For the above situation, the already set range E is updated to shrink
 *    its end to the start of C, and its length is reduced to
 *    BB_LEN(E) - BB_LEN(C).
 * 5) The clearing range is partially overlapped with an already set bad
 *    block range from the bad block table.
 * 5.1) The already set bad block range is front overlapped with the
 *    clearing range.
 *    For such a situation, the clearing range C can be treated as two
 *    parts. The first part ends at the start LBA of range E, and the second
 *    part starts at the same LBA as range E.
 *    +----+-----+              +----+   +-----+
 *    | C1 | C2  |              | C1 |   | C2  |
 *    +----+-----+     ===>     +----+   +-----+
 *         +------------+            +------------+
 *         |      E     |            |      E     |
 *         +------------+            +------------+
 *    Now the first part C1 can be handled as condition 1), and the second
 *    part C2 can be handled as condition 3.1) in the next loop.
 * 5.2) The already set bad block range is behind overlapped with the
 *    clearing range.
 *    For such a situation, the clearing range C can be treated as two
 *    parts. The first part C1 ends at the same end LBA as range E, and the
 *    second part starts at the end LBA of range E.
 *         +----+-----+             +----+   +-----+
 *         | C1 | C2  |             | C1 |   | C2  |
 *         +----+-----+    ===>     +----+   +-----+
 *    +------------+           +------------+
 *    |      E     |           |      E     |
 *    +------------+           +------------+
 *    Now the first part clearing range C1 can be handled as condition 4),
 *    and the second part clearing range C2 can be handled as condition 1)
 *    in the next loop.
 *
 * All bad block range clearing can be simplified into the above 5
 * situations by only handling the head part of the clearing range in each
 * run of the while-loop. The idea is similar to bad block range setting but
 * much simpler.
 */
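
/*
 * Throughout this file each bad table entry is a u64 built and decoded with
 * the BB_MAKE()/BB_OFFSET()/BB_LEN()/BB_END()/BB_ACK() helpers from
 * <linux/badblocks.h>. An illustrative round-trip with made-up values:
 *
 *	u64 e = BB_MAKE(4096, 16, 1);	// start 4096, 16 sectors, acked
 *
 *	BB_OFFSET(e);	// == 4096
 *	BB_LEN(e);	// == 16
 *	BB_END(e);	// == 4112, i.e. BB_OFFSET(e) + BB_LEN(e)
 *	BB_ACK(e);	// non-zero, the range is acknowledged
 */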

/*
 * Find the range that starts at-or-before 's' from the bad table. The
 * search starts from index 'hint' and stops at index 'hint_end' from the
 * bad table.
 */
static int prev_by_hint(struct badblocks *bb, sector_t s, int hint)
{
	int hint_end = hint + 2;
	u64 *p = bb->page;
	int ret = -1;

	while ((hint < hint_end) && ((hint + 1) <= bb->count) &&
	       (BB_OFFSET(p[hint]) <= s)) {
		if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) {
			ret = hint;
			break;
		}
		hint++;
	}

	return ret;
}

/*
 * Find the range that starts at-or-before bad->start. If 'hint' is provided
 * (hint >= 0) then search in the bad table from hint first. It is very
 * likely that the wanted bad range can be found from the hint index, so the
 * unnecessary while-loop iterations can be avoided.
 */
static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad,
			  int hint)
{
	sector_t s = bad->start;
	int ret = -1;
	int lo, hi;
	u64 *p;

	if (!bb->count)
		goto out;

	if (hint >= 0) {
		ret = prev_by_hint(bb, s, hint);
		if (ret >= 0)
			goto out;
	}

	lo = 0;
	hi = bb->count;
	p = bb->page;

	/* The following bisect search might be unnecessary */
	if (BB_OFFSET(p[lo]) > s)
		goto out;
	if (BB_OFFSET(p[hi - 1]) <= s) {
		ret = hi - 1;
		goto out;
	}

	/* Do bisect search in bad table */
	while (hi - lo > 1) {
		int mid = (lo + hi)/2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a == s) {
			ret = mid;
			goto out;
		}

		if (a < s)
			lo = mid;
		else
			hi = mid;
	}

	if (BB_OFFSET(p[lo]) <= s)
		ret = lo;
out:
	return ret;
}

/*
 * Return 'true' if the range indicated by 'bad' can be backward merged
 * with the bad range (from the bad table) indexed by 'behind'.
 */
static bool can_merge_behind(struct badblocks *bb,
			     struct badblocks_context *bad, int behind)
{
	sector_t sectors = bad->len;
	sector_t s = bad->start;
	u64 *p = bb->page;

	if ((s < BB_OFFSET(p[behind])) &&
	    ((s + sectors) >= BB_OFFSET(p[behind])) &&
	    ((BB_END(p[behind]) - s) <= BB_MAX_LEN) &&
	    BB_ACK(p[behind]) == bad->ack)
		return true;
	return false;
}

/*
 * Do backward merge for the range indicated by 'bad' and the bad range
 * (from the bad table) indexed by 'behind'. The return value is the number
 * of sectors merged from bad->len.
 */
static int behind_merge(struct badblocks *bb, struct badblocks_context *bad,
			int behind)
{
	sector_t sectors = bad->len;
	sector_t s = bad->start;
	u64 *p = bb->page;
	int merged = 0;

	WARN_ON(s >= BB_OFFSET(p[behind]));
	WARN_ON((s + sectors) < BB_OFFSET(p[behind]));

	if (s < BB_OFFSET(p[behind])) {
		merged = BB_OFFSET(p[behind]) - s;
		p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack);

		WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN);
	}

	return merged;
}

/*
 * Return 'true' if the range indicated by 'bad' can be forward
 * merged with the bad range (from the bad table) indexed by 'prev'.
 */
static bool can_merge_front(struct badblocks *bb, int prev,
			    struct badblocks_context *bad)
{
	sector_t s = bad->start;
	u64 *p = bb->page;

	if (BB_ACK(p[prev]) == bad->ack &&
	    (s < BB_END(p[prev]) ||
	     (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN))))
		return true;
	return false;
}

/*
 * Do forward merge for the range indicated by 'bad' and the bad range
 * (from the bad table) indexed by 'prev'. The return value is the number
 * of sectors merged from bad->len.
 */
static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad)
{
	sector_t sectors = bad->len;
	sector_t s = bad->start;
	u64 *p = bb->page;
	int merged = 0;

	WARN_ON(s > BB_END(p[prev]));

	if (s < BB_END(p[prev])) {
		merged = min_t(sector_t, sectors, BB_END(p[prev]) - s);
	} else {
		merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev]));
		if ((prev + 1) < bb->count &&
		    merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) {
			merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]);
		}

		p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
				  BB_LEN(p[prev]) + merged, bad->ack);
	}

	return merged;
}

/*
 * 'Combine' is a special case which can_merge_front() is not able to
 * handle: If a bad range (indexed by 'prev' from the bad table) starts
 * exactly at bad->start, and the bad range ahead of 'prev' (indexed by
 * 'prev - 1' from the bad table) ends exactly where 'prev' starts, and
 * the sum of their lengths does not exceed the BB_MAX_LEN limitation, then
 * these two bad ranges (from the bad table) can be combined.
 *
 * Return 'true' if the bad ranges indexed by 'prev' and 'prev - 1' from
 * the bad table can be combined.
 */
static bool can_combine_front(struct badblocks *bb, int prev,
			      struct badblocks_context *bad)
{
	u64 *p = bb->page;

	if ((prev > 0) &&
	    (BB_OFFSET(p[prev]) == bad->start) &&
	    (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) &&
	    (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) &&
	    (BB_ACK(p[prev - 1]) == BB_ACK(p[prev])))
		return true;
	return false;
}

/*
 * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from the bad
 * table) into one larger bad range, and the new range is indexed by
 * 'prev - 1'.
 * The caller of front_combine() will decrease bb->count, therefore
 * it is unnecessary to clear p[prev] after the front merge.
 */
static void front_combine(struct badblocks *bb, int prev)
{
	u64 *p = bb->page;

	p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]),
			      BB_LEN(p[prev - 1]) + BB_LEN(p[prev]),
			      BB_ACK(p[prev]));
	if ((prev + 1) < bb->count)
		memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8);
}

/*
 * Return 'true' if the range indicated by 'bad' is exactly forward
 * overlapped with the bad range (from the bad table) indexed by 'front'.
 * Exactly forward overlap means the bad range (from the bad table) indexed
 * by 'front' does not cover the whole range indicated by 'bad'.
 */
static bool overlap_front(struct badblocks *bb, int front,
			  struct badblocks_context *bad)
{
	u64 *p = bb->page;

	if (bad->start >= BB_OFFSET(p[front]) &&
	    bad->start < BB_END(p[front]))
		return true;
	return false;
}

/*
 * Return 'true' if the range indicated by 'bad' is exactly backward
 * overlapped with the bad range (from the bad table) indexed by 'behind'.
 */
static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad,
			   int behind)
{
	u64 *p = bb->page;

	if (bad->start < BB_OFFSET(p[behind]) &&
	    (bad->start + bad->len) > BB_OFFSET(p[behind]))
		return true;
	return false;
}

/*
 * Return 'true' if the range indicated by 'bad' can overwrite the bad
 * range (from the bad table) indexed by 'prev'.
 *
 * The range indicated by 'bad' can overwrite the bad range indexed by
 * 'prev' when,
 * 1) The whole range indicated by 'bad' can cover part or all of the bad
 *    range (from the bad table) indexed by 'prev'.
 * 2) The ack value of 'bad' is larger than or equal to the ack value of
 *    the bad range 'prev'.
 *
 * If the overwriting doesn't cover the whole bad range (from the bad table)
 * indexed by 'prev', new range(s) might be split from the existing bad
 * range,
 * 1) The overwrite covers the head or tail part of the existing bad range:
 *    1 extra bad range will be split off and added into the bad table.
 * 2) The overwrite covers the middle of the existing bad range: 2 extra bad
 *    ranges will be split off (ahead of and after the overwritten range)
 *    and added into the bad table.
 * The number of extra split ranges of the overwriting is stored in
 * 'extra' and returned for the caller.
 */
static bool can_front_overwrite(struct badblocks *bb, int prev,
				struct badblocks_context *bad, int *extra)
{
	u64 *p = bb->page;
	int len;

	WARN_ON(!overlap_front(bb, prev, bad));

	if (BB_ACK(p[prev]) >= bad->ack)
		return false;

	if (BB_END(p[prev]) <= (bad->start + bad->len)) {
		len = BB_END(p[prev]) - bad->start;
		if (BB_OFFSET(p[prev]) == bad->start)
			*extra = 0;
		else
			*extra = 1;

		bad->len = len;
	} else {
		if (BB_OFFSET(p[prev]) == bad->start)
			*extra = 1;
		else
			/*
			 * prev range will be split into two; besides the
			 * overwritten one, an extra slot is needed from
			 * the bad table.
			 */
			*extra = 2;
	}

	if ((bb->count + (*extra)) >= MAX_BADBLOCKS)
		return false;

	return true;
}
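
/*
 * Illustrative sketch (made-up numbers) of the 'extra' result above: with
 * p[prev] covering sectors [100, 200) and unacked, an acked 'bad' range of
 * [150, 180) leaves both a head [100, 150) and a tail [180, 200) behind,
 * so *extra == 2; an acked 'bad' range of [100, 150) leaves only the tail,
 * so *extra == 1.
 */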

/*
 * Do the overwrite from the range indicated by 'bad' to the bad range
 * (from the bad table) indexed by 'prev'.
 * The previously called can_front_overwrite() will provide how many
 * extra bad range(s) might be split and added into the bad table. All
 * the splitting cases in the bad table will be handled here.
 */
static int front_overwrite(struct badblocks *bb, int prev,
			   struct badblocks_context *bad, int extra)
{
	u64 *p = bb->page;
	sector_t orig_end = BB_END(p[prev]);
	int orig_ack = BB_ACK(p[prev]);

	switch (extra) {
	case 0:
		p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]),
				  bad->ack);
		break;
	case 1:
		if (BB_OFFSET(p[prev]) == bad->start) {
			p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
					  bad->len, bad->ack);
			memmove(p + prev + 2, p + prev + 1,
				(bb->count - prev - 1) * 8);
			p[prev + 1] = BB_MAKE(bad->start + bad->len,
					      orig_end - BB_END(p[prev]),
					      orig_ack);
		} else {
			p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
					  bad->start - BB_OFFSET(p[prev]),
					  orig_ack);
			/*
			 * prev + 2 -> prev + 1 + 1, which is for,
			 * 1) prev + 1: the slot index of the previous one
			 * 2) + 1: one more slot for extra being 1.
			 */
			memmove(p + prev + 2, p + prev + 1,
				(bb->count - prev - 1) * 8);
			p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
		}
		break;
	case 2:
		p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
				  bad->start - BB_OFFSET(p[prev]),
				  orig_ack);
		/*
		 * prev + 3 -> prev + 1 + 2, which is for,
		 * 1) prev + 1: the slot index of the previous one
		 * 2) + 2: two more slots for extra being 2.
		 */
		memmove(p + prev + 3, p + prev + 1,
			(bb->count - prev - 1) * 8);
		p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
		p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]),
				      orig_end - BB_END(p[prev + 1]),
				      orig_ack);
		break;
	default:
		break;
	}

	return bad->len;
}

/*
 * Explicitly insert a range indicated by 'bad' into the bad table, where
 * the location is indexed by 'at'.
 */
static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad)
{
	u64 *p = bb->page;
	int len;

	WARN_ON(badblocks_full(bb));

	len = min_t(sector_t, bad->len, BB_MAX_LEN);
	if (at < bb->count)
		memmove(p + at + 1, p + at, (bb->count - at) * 8);
	p[at] = BB_MAKE(bad->start, len, bad->ack);

	return len;
}

static void badblocks_update_acked(struct badblocks *bb)
{
	bool unacked = false;
	u64 *p = bb->page;
	int i;

	if (!bb->unacked_exist)
		return;

	for (i = 0; i < bb->count; i++) {
		if (!BB_ACK(p[i])) {
			unacked = true;
			break;
		}
	}

	if (!unacked)
		bb->unacked_exist = 0;
}

/* Do the exact work to set a bad block range into the bad block table */
static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
			  int acknowledged)
{
	int retried = 0, space_desired = 0;
	int orig_len, len = 0, added = 0;
	struct badblocks_context bad;
	int prev = -1, hint = -1;
	sector_t orig_start = s;
	unsigned long flags;
	int rv = 0;
	u64 *p;

	if (bb->shift < 0)
		/* badblocks are disabled */
		return 1;

	if (sectors == 0)
		/* Invalid sectors number */
		return 1;

	if (bb->shift) {
		/* round the start down, and the end up */
		sector_t next = s + sectors;

		rounddown(s, bb->shift);
		roundup(next, bb->shift);
		sectors = next - s;
	}

	write_seqlock_irqsave(&bb->lock, flags);

	orig_len = sectors;
	bad.ack = acknowledged;
	p = bb->page;

re_insert:
	bad.start = s;
	bad.len = sectors;
	len = 0;

	if (badblocks_empty(bb)) {
		len = insert_at(bb, 0, &bad);
		bb->count++;
		added++;
		goto update_sectors;
	}

	prev = prev_badblocks(bb, &bad, hint);

	/* start before all badblocks */
	if (prev < 0) {
		if (!badblocks_full(bb)) {
			/* insert on the first */
			if (bad.len > (BB_OFFSET(p[0]) - bad.start))
				bad.len = BB_OFFSET(p[0]) - bad.start;
			len = insert_at(bb, 0, &bad);
			bb->count++;
			added++;
			hint = 0;
			goto update_sectors;
		}

		/* No space, try to merge */
		if (overlap_behind(bb, &bad, 0)) {
			if (can_merge_behind(bb, &bad, 0)) {
				len = behind_merge(bb, &bad, 0);
				added++;
			} else {
				len = BB_OFFSET(p[0]) - s;
				space_desired = 1;
			}
			hint = 0;
			goto update_sectors;
		}

		/* no table space and give up */
		goto out;
	}

	/* in case p[prev-1] can be merged with p[prev] */
	if (can_combine_front(bb, prev, &bad)) {
		front_combine(bb, prev);
		bb->count--;
		added++;
		hint = prev;
		goto update_sectors;
	}

	if (overlap_front(bb, prev, &bad)) {
		if (can_merge_front(bb, prev, &bad)) {
			len = front_merge(bb, prev, &bad);
			added++;
		} else {
			int extra = 0;

			if (!can_front_overwrite(bb, prev, &bad, &extra)) {
				len = min_t(sector_t,
					    BB_END(p[prev]) - s, sectors);
				hint = prev;
				goto update_sectors;
			}

			len = front_overwrite(bb, prev, &bad, extra);
			added++;
			bb->count += extra;

			if (can_combine_front(bb, prev, &bad)) {
				front_combine(bb, prev);
				bb->count--;
			}
		}
		hint = prev;
		goto update_sectors;
	}

	if (can_merge_front(bb, prev, &bad)) {
		len = front_merge(bb, prev, &bad);
		added++;
		hint = prev;
		goto update_sectors;
	}

	/* if no space in table, still try to merge in the covered range */
	if (badblocks_full(bb)) {
		/* skip the cannot-merge range */
		if (((prev + 1) < bb->count) &&
		    overlap_behind(bb, &bad, prev + 1) &&
		    ((s + sectors) >= BB_END(p[prev + 1]))) {
			len = BB_END(p[prev + 1]) - s;
			hint = prev + 1;
			goto update_sectors;
		}

		/* no retry any more */
		len = sectors;
		space_desired = 1;
		goto update_sectors;
	}

	/* cannot merge and there is space in bad table */
	if ((prev + 1) < bb->count &&
	    overlap_behind(bb, &bad, prev + 1))
		bad.len = min_t(sector_t,
				bad.len, BB_OFFSET(p[prev + 1]) - bad.start);

	len = insert_at(bb, prev + 1, &bad);
	bb->count++;
	added++;
	hint = prev + 1;

update_sectors:
	s += len;
	sectors -= len;

	if (sectors > 0)
		goto re_insert;

	WARN_ON(sectors < 0);

	/*
	 * Check whether the following already set range can be
	 * merged. (prev < 0) condition is not handled here,
	 * because it's already complicated enough.
	 */
	if (prev >= 0 &&
	    (prev + 1) < bb->count &&
	    BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&
	    (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&
	    BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {
		p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
				  BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),
				  BB_ACK(p[prev]));

		if ((prev + 2) < bb->count)
			memmove(p + prev + 1, p + prev + 2,
				(bb->count - (prev + 2)) * 8);
		bb->count--;
	}

	if (space_desired && !badblocks_full(bb)) {
		s = orig_start;
		sectors = orig_len;
		space_desired = 0;
		if (retried++ < 3)
			goto re_insert;
	}

out:
	if (added) {
		bb->changed = 1;
		if (!acknowledged)
			bb->unacked_exist = 1;
		else
			badblocks_update_acked(bb);
	}

	write_sequnlock_irqrestore(&bb->lock, flags);

	if (!added)
		rv = 1;

	return rv;
}

/*
 * Clear the bad block range from the bad block table which is front
 * overlapped with the clearing range. The return value is how many sectors
 * from an already set bad block range are cleared. If the whole bad block
 * range is covered by the clearing range and fully cleared, 'deleted' is
 * set to 1 for the caller to reduce bb->count.
 */
static int front_clear(struct badblocks *bb, int prev,
		       struct badblocks_context *bad, int *deleted)
{
	sector_t sectors = bad->len;
	sector_t s = bad->start;
	u64 *p = bb->page;
	int cleared = 0;

	*deleted = 0;
	if (s == BB_OFFSET(p[prev])) {
		if (BB_LEN(p[prev]) > sectors) {
			p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors,
					  BB_LEN(p[prev]) - sectors,
					  BB_ACK(p[prev]));
			cleared = sectors;
		} else {
			/* BB_LEN(p[prev]) <= sectors */
			cleared = BB_LEN(p[prev]);
			if ((prev + 1) < bb->count)
				memmove(p + prev, p + prev + 1,
					(bb->count - prev - 1) * 8);
			*deleted = 1;
		}
	} else if (s > BB_OFFSET(p[prev])) {
		if (BB_END(p[prev]) <= (s + sectors)) {
			cleared = BB_END(p[prev]) - s;
			p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
					  s - BB_OFFSET(p[prev]),
					  BB_ACK(p[prev]));
		} else {
			/* Splitting is handled in front_splitting_clear() */
			WARN_ON(1);
		}
	}

	return cleared;
}

/*
 * Handle the condition that the clearing range hits the middle of an
 * already set bad block range from the bad block table. In this condition
 * the existing bad block range is split into two after the middle part is
 * cleared.
 */
static int front_splitting_clear(struct badblocks *bb, int prev,
				 struct badblocks_context *bad)
{
	u64 *p = bb->page;
	u64 end = BB_END(p[prev]);
	int ack = BB_ACK(p[prev]);
	sector_t sectors = bad->len;
	sector_t s = bad->start;

	p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
			  s - BB_OFFSET(p[prev]),
			  ack);
	memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8);
	p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack);
	return sectors;
}
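
/*
 * Illustrative sketch (made-up numbers): clearing [10, 14) out of an entry
 * covering [0, 20) leaves two entries, [0, 10) and [14, 20). After the
 * routine above runs:
 *
 *	p[prev]     == BB_MAKE(0, 10, ack);
 *	p[prev + 1] == BB_MAKE(14, 6, ack);
 */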

/* Do the exact work to clear a bad block range from the bad block table */
static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
{
	struct badblocks_context bad;
	int prev = -1, hint = -1;
	int len = 0, cleared = 0;
	int rv = 0;
	u64 *p;

	if (bb->shift < 0)
		/* badblocks are disabled */
		return 1;

	if (sectors == 0)
		/* Invalid sectors number */
		return 1;

	if (bb->shift) {
		sector_t target;

		/* When clearing we round the start up and the end down.
		 * This should not matter as the shift should align with
		 * the block size and no rounding should ever be needed.
		 * However it is better to think a block is bad when it
		 * isn't than to think a block is not bad when it is.
		 */
		target = s + sectors;
		roundup(s, bb->shift);
		rounddown(target, bb->shift);
		sectors = target - s;
	}

	write_seqlock_irq(&bb->lock);

	bad.ack = true;
	p = bb->page;

re_clear:
	bad.start = s;
	bad.len = sectors;

	if (badblocks_empty(bb)) {
		len = sectors;
		cleared++;
		goto update_sectors;
	}

	prev = prev_badblocks(bb, &bad, hint);

	/* Start before all badblocks */
	if (prev < 0) {
		if (overlap_behind(bb, &bad, 0)) {
			len = BB_OFFSET(p[0]) - s;
			hint = 0;
		} else {
			len = sectors;
		}
		/*
		 * Both situations are to clear non-bad range,
		 * should be treated as successful
		 */
		cleared++;
		goto update_sectors;
	}

	/* Start after all badblocks */
	if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
		len = sectors;
		cleared++;
		goto update_sectors;
	}

	/* Clear will split a bad record but the table is full */
	if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) &&
	    (BB_END(p[prev]) > (bad.start + sectors))) {
		len = sectors;
		goto update_sectors;
	}

	if (overlap_front(bb, prev, &bad)) {
		if ((BB_OFFSET(p[prev]) < bad.start) &&
		    (BB_END(p[prev]) > (bad.start + bad.len))) {
			/* Splitting */
			if ((bb->count + 1) < MAX_BADBLOCKS) {
				len = front_splitting_clear(bb, prev, &bad);
				bb->count += 1;
				cleared++;
			} else {
				/* No space to split, give up */
				len = sectors;
			}
		} else {
			int deleted = 0;

			len = front_clear(bb, prev, &bad, &deleted);
			bb->count -= deleted;
			cleared++;
		}

		hint = prev;
		goto update_sectors;
	}

	/* Not front overlap, but behind overlap */
	if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
		len = BB_OFFSET(p[prev + 1]) - bad.start;
		hint = prev + 1;
		/* Clear non-bad range should be treated as successful */
		cleared++;
		goto update_sectors;
	}

	/* Not cover any badblocks range in the table */
	len = sectors;
	/* Clear non-bad range should be treated as successful */
	cleared++;

update_sectors:
	s += len;
	sectors -= len;

	if (sectors > 0)
		goto re_clear;

	WARN_ON(sectors < 0);

	if (cleared) {
		badblocks_update_acked(bb);
		bb->changed = 1;
	}

	write_sequnlock_irq(&bb->lock);

	if (!cleared)
		rv = 1;

	return rv;
}

/* Do the exact work to check a bad block range from the bad block table */
static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
			    sector_t *first_bad, int *bad_sectors)
{
	int unacked_badblocks, acked_badblocks;
	int prev = -1, hint = -1, set = 0;
	struct badblocks_context bad;
	unsigned int seq;
	int len, rv;
	u64 *p;

	WARN_ON(bb->shift < 0 || sectors == 0);

	if (bb->shift > 0) {
		sector_t target;

		/* round the start down, and the end up */
		target = s + sectors;
		rounddown(s, bb->shift);
		roundup(target, bb->shift);
		sectors = target - s;
	}

retry:
	seq = read_seqbegin(&bb->lock);

	p = bb->page;
	unacked_badblocks = 0;
	acked_badblocks = 0;

re_check:
	bad.start = s;
	bad.len = sectors;

	if (badblocks_empty(bb)) {
		len = sectors;
		goto update_sectors;
	}

	prev = prev_badblocks(bb, &bad, hint);

	/* start after all badblocks */
	if ((prev >= 0) &&
	    ((prev + 1) >= bb->count) && !overlap_front(bb, prev, &bad)) {
		len = sectors;
		goto update_sectors;
	}

	/* Overlapped with front badblocks record */
	if ((prev >= 0) && overlap_front(bb, prev, &bad)) {
		if (BB_ACK(p[prev]))
			acked_badblocks++;
		else
			unacked_badblocks++;

		if (BB_END(p[prev]) >= (s + sectors))
			len = sectors;
		else
			len = BB_END(p[prev]) - s;

		if (set == 0) {
			*first_bad = BB_OFFSET(p[prev]);
			*bad_sectors = BB_LEN(p[prev]);
			set = 1;
		}
		goto update_sectors;
	}

	/* Not front overlap, but behind overlap */
	if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
		len = BB_OFFSET(p[prev + 1]) - bad.start;
		hint = prev + 1;
		goto update_sectors;
	}

	/* not cover any badblocks range in the table */
	len = sectors;

update_sectors:
	s += len;
	sectors -= len;

	if (sectors > 0)
		goto re_check;

	WARN_ON(sectors < 0);

	if (unacked_badblocks > 0)
		rv = -1;
	else if (acked_badblocks > 0)
		rv = 1;
	else
		rv = 0;

	if (read_seqretry(&bb->lock, seq))
		goto retry;

	return rv;
}

/**
 * badblocks_check() - check a given range for bad sectors
 * @bb: the badblocks structure that holds all badblock information
 * @s: sector (start) at which to check for badblocks
 * @sectors: number of sectors to check for badblocks
 * @first_bad: pointer to store location of the first badblock
 * @bad_sectors: pointer to store number of badblocks after @first_bad
 *
 * We can record which blocks on each device are 'bad' and so just
 * fail those blocks, or that stripe, rather than the whole device.
 * Entries in the bad-block table are 64 bits wide. This comprises:
 * Length of bad-range, in sectors: 0-511 for lengths 1-512
 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
 * A 'shift' can be set so that larger blocks are tracked and
 * consequently larger devices can be covered.
 * 'Acknowledged' flag - 1 bit. - the most significant bit.
 *
 * Locking of the bad-block table uses a seqlock so badblocks_check
 * might need to retry if it is very unlucky.
 * We will sometimes want to check for bad blocks in a bi_end_io function,
 * so we use the write_seqlock_irq variant.
 *
 * When looking for a bad block we specify a range and want to
 * know if any block in the range is bad. So we binary-search
 * to the last range that starts at-or-before the given endpoint
 * (or "before the sector after the target range"),
 * then see if it ends after the given start.
 *
 * Return:
 *  0: there are no known bad blocks in the range
 *  1: there are known bad blocks which are all acknowledged
 * -1: there are bad blocks which have not yet been acknowledged in metadata.
 * plus the start/length of the first bad section we overlap.
 */
int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
		    sector_t *first_bad, int *bad_sectors)
{
	return _badblocks_check(bb, s, sectors, first_bad, bad_sectors);
}
EXPORT_SYMBOL_GPL(badblocks_check);
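
/*
 * Illustrative usage sketch for badblocks_check(); 'bb', 'sector' and
 * 'nr_sectors' are assumed to come from the caller:
 *
 *	sector_t first_bad;
 *	int bad_sectors;
 *
 *	if (badblocks_check(bb, sector, nr_sectors,
 *			    &first_bad, &bad_sectors) < 0)
 *		pr_warn("unacknowledged bad range at %llu (%d sectors)\n",
 *			(unsigned long long)first_bad, bad_sectors);
 */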

/**
 * badblocks_set() - Add a range of bad blocks to the table.
 * @bb: the badblocks structure that holds all badblock information
 * @s: first sector to mark as bad
 * @sectors: number of sectors to mark as bad
 * @acknowledged: whether to mark the bad sectors as acknowledged
 *
 * This might extend the table, or might contract it if two adjacent ranges
 * can be merged. We binary-search to find the 'insertion' point, then
 * decide how best to handle it.
 *
 * Return:
 *  0: success
 *  1: failed to set badblocks (out of space)
 */
int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
		  int acknowledged)
{
	return _badblocks_set(bb, s, sectors, acknowledged);
}
EXPORT_SYMBOL_GPL(badblocks_set);
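
/*
 * Illustrative sketch (made-up values): marking eight sectors starting at
 * a chosen LBA as bad and acknowledged; a non-zero return means the table
 * ran out of space:
 *
 *	if (badblocks_set(bb, 4096, 8, 1))
 *		pr_warn("badblocks table full, range not recorded\n");
 */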

/**
 * badblocks_clear() - Remove a range of bad blocks from the table.
 * @bb: the badblocks structure that holds all badblock information
 * @s: first sector to clear
 * @sectors: number of sectors to clear
 *
 * This may involve extending the table if we split a region,
 * but it must not fail. So if the table becomes full, we just
 * drop the remove request.
 *
 * Return:
 *  0: success
 *  1: failed to clear badblocks
 */
int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
{
	return _badblocks_clear(bb, s, sectors);
}
EXPORT_SYMBOL_GPL(badblocks_clear);
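
/*
 * Illustrative sketch (made-up values, bb->shift assumed 0): clearing part
 * of a previously set range splits it, which is why clearing can consume a
 * table slot:
 *
 *	badblocks_set(bb, 0, 20, 1);	// table holds [0, 20)
 *	badblocks_clear(bb, 10, 4);	// table holds [0, 10) and [14, 20)
 */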

/**
 * ack_all_badblocks() - Acknowledge all bad blocks in a list.
 * @bb: the badblocks structure that holds all badblock information
 *
 * This only succeeds if ->changed is clear. It is used by
 * in-kernel metadata updates.
 */
void ack_all_badblocks(struct badblocks *bb)
{
	if (bb->page == NULL || bb->changed)
		/* no point even trying */
		return;
	write_seqlock_irq(&bb->lock);

	if (bb->changed == 0 && bb->unacked_exist) {
		u64 *p = bb->page;
		int i;

		for (i = 0; i < bb->count; i++) {
			if (!BB_ACK(p[i])) {
				sector_t start = BB_OFFSET(p[i]);
				int len = BB_LEN(p[i]);

				p[i] = BB_MAKE(start, len, 1);
			}
		}
		bb->unacked_exist = 0;
	}
	write_sequnlock_irq(&bb->lock);
}
EXPORT_SYMBOL_GPL(ack_all_badblocks);

/**
 * badblocks_show() - sysfs access to bad-blocks list
 * @bb: the badblocks structure that holds all badblock information
 * @page: buffer received from sysfs
 * @unack: whether to show unacknowledged badblocks
 *
 * Return:
 *  Length of returned data
 */
ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
{
	int i;
	u64 *p = bb->page;
	ssize_t len;
	unsigned int seq;

retry:
	seq = read_seqbegin(&bb->lock);

	len = 0;
	i = 0;

	while (len < PAGE_SIZE && i < bb->count) {
		sector_t s = BB_OFFSET(p[i]);
		unsigned int length = BB_LEN(p[i]);
		int ack = BB_ACK(p[i]);

		i++;

		if (unack && ack)
			continue;

		len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
				(unsigned long long)s << bb->shift,
				length << bb->shift);
	}
	if (unack && len == 0)
		bb->unacked_exist = 0;

	if (read_seqretry(&bb->lock, seq))
		goto retry;

	return len;
}
EXPORT_SYMBOL_GPL(badblocks_show);

/**
 * badblocks_store() - sysfs access to bad-blocks list
 * @bb: the badblocks structure that holds all badblock information
 * @page: buffer received from sysfs
 * @len: length of data received from sysfs
 * @unack: whether to set the range as unacknowledged
 *
 * Return:
 *  Length of the buffer processed or -ve error.
 */
ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
			int unack)
{
	unsigned long long sector;
	int length;
	char newline;

	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
	case 3:
		if (newline != '\n')
			return -EINVAL;
		fallthrough;
	case 2:
		if (length <= 0)
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (badblocks_set(bb, sector, length, !unack))
		return -ENOSPC;

	return len;
}
EXPORT_SYMBOL_GPL(badblocks_store);
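
/*
 * Illustrative sketch of the sysfs text format (values are made up): each
 * line is "<start sector> <length>", so feeding the line below has the
 * same effect as badblocks_set(bb, 32768, 8, 1):
 *
 *	const char buf[] = "32768 8\n";
 *
 *	badblocks_store(bb, buf, sizeof(buf) - 1, 0);
 */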

static int __badblocks_init(struct device *dev, struct badblocks *bb,
			    int enable)
{
	bb->dev = dev;
	bb->count = 0;
	if (enable)
		bb->shift = 0;
	else
		bb->shift = -1;
	if (dev)
		bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
	else
		bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!bb->page) {
		bb->shift = -1;
		return -ENOMEM;
	}
	seqlock_init(&bb->lock);

	return 0;
}

/**
 * badblocks_init() - initialize the badblocks structure
 * @bb: the badblocks structure that holds all badblock information
 * @enable: whether to enable badblocks accounting
 *
 * Return:
 *  0: success
 *  -ve errno: on error
 */
int badblocks_init(struct badblocks *bb, int enable)
{
	return __badblocks_init(NULL, bb, enable);
}
EXPORT_SYMBOL_GPL(badblocks_init);

int devm_init_badblocks(struct device *dev, struct badblocks *bb)
{
	if (!bb)
		return -EINVAL;
	return __badblocks_init(dev, bb, 1);
}
EXPORT_SYMBOL_GPL(devm_init_badblocks);

/**
 * badblocks_exit() - free the badblocks structure
 * @bb: the badblocks structure that holds all badblock information
 */
void badblocks_exit(struct badblocks *bb)
{
	if (!bb)
		return;
	if (bb->dev)
		devm_kfree(bb->dev, bb->page);
	else
		kfree(bb->page);
	bb->page = NULL;
}
EXPORT_SYMBOL_GPL(badblocks_exit);
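
/*
 * Illustrative lifecycle sketch for a driver-owned badblocks instance
 * (error handling condensed; 'bb' is assumed to be caller-allocated):
 *
 *	struct badblocks bb;
 *	int ret;
 *
 *	ret = badblocks_init(&bb, 1);	// enable accounting
 *	if (ret)
 *		return ret;
 *	...
 *	badblocks_exit(&bb);
 */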