// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2015 IT University of Copenhagen (rrpc.c)
 * Copyright (C) 2016 CNEX Labs
 * Initial release: Javier Gonzalez <javier@cnexlabs.com>
 *                  Matias Bjorling <matias@cnexlabs.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Implementation of a physical block-device target for Open-channel SSDs.
 *
 * pblk-init.c - pblk's initialization.
 */

#include "pblk.h"
#include "pblk-trace.h"

static unsigned int write_buffer_size;
module_param(write_buffer_size, uint, 0644);
MODULE_PARM_DESC(write_buffer_size, "number of entries in a write buffer");
struct pblk_global_caches {
	struct kmem_cache	*ws;
	struct kmem_cache	*rec;
	struct kmem_cache	*g_rq;
	struct kmem_cache	*w_rq;

	struct kref		kref;

	struct mutex		mutex;		/* Ensures consistency between
						 * caches and kref
						 */
};

static struct pblk_global_caches pblk_caches = {
	.mutex = __MUTEX_INITIALIZER(pblk_caches.mutex),
	.kref = KREF_INIT(0),
};

struct bio_set pblk_bio_set;
static blk_qc_t pblk_submit_bio(struct bio *bio)
{
	struct pblk *pblk = bio->bi_disk->queue->queuedata;

	if (bio_op(bio) == REQ_OP_DISCARD) {
		pblk_discard(pblk, bio);
		if (!(bio->bi_opf & REQ_PREFLUSH)) {
			bio_endio(bio);
			return BLK_QC_T_NONE;
		}
	}

	/* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap
	 * constraint. Writes can be of arbitrary size.
	 */
	if (bio_data_dir(bio) == READ) {
		blk_queue_split(&bio);
		pblk_submit_read(pblk, bio);
	} else {
		/* Prevent deadlock in the case of a modest LUN configuration
		 * and large user I/Os. Unless stalled, the rate limiter
		 * leaves at least 256KB available for user I/O.
		 */
		if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl))
			blk_queue_split(&bio);

		pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
	}

	return BLK_QC_T_NONE;
}
static const struct block_device_operations pblk_bops = {
	.submit_bio		= pblk_submit_bio,
};
static size_t pblk_trans_map_size(struct pblk *pblk)
{
	int entry_size = 8;

	if (pblk->addrf_len < 32)
		entry_size = 4;

	return entry_size * pblk->capacity;
}
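/*
 * Sizing sketch (hypothetical numbers): the L2P table uses 4-byte entries
 * when the packed address format fits in 32 bits and 8-byte entries
 * otherwise. An instance exposing, say, 256M user sectors with a 32-bit
 * address format would therefore need 256M * 4 B = 1 GiB of virtually
 * contiguous memory for pblk->trans_map.
 */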
#ifdef CONFIG_NVM_PBLK_DEBUG
static u32 pblk_l2p_crc(struct pblk *pblk)
{
	size_t map_size;
	u32 crc = ~(u32)0;

	map_size = pblk_trans_map_size(pblk);
	crc = crc32_le(crc, pblk->trans_map, map_size);
	return crc;
}
#endif

static void pblk_l2p_free(struct pblk *pblk)
{
	vfree(pblk->trans_map);
}
static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
{
	struct pblk_line *line = NULL;

	if (factory_init) {
		guid_gen(&pblk->instance_uuid);
	} else {
		line = pblk_recov_l2p(pblk);
		if (IS_ERR(line)) {
			pblk_err(pblk, "could not recover l2p table\n");
			return -EFAULT;
		}
	}

#ifdef CONFIG_NVM_PBLK_DEBUG
	pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
#endif

	/* Free full lines directly as GC has not been started yet */
	pblk_gc_free_full_lines(pblk);

	if (!line) {
		/* Configure next line for user data */
		line = pblk_line_get_first_data(pblk);
		if (!line)
			return -EFAULT;
	}

	return 0;
}
static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
{
	sector_t i;
	struct ppa_addr ppa;
	size_t map_size;
	int ret = 0;

	map_size = pblk_trans_map_size(pblk);
	pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN |
				    __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM);
	if (!pblk->trans_map) {
		pblk_err(pblk, "failed to allocate L2P (need %zu of memory)\n",
				map_size);
		return -ENOMEM;
	}

	pblk_ppa_set_empty(&ppa);

	for (i = 0; i < pblk->capacity; i++)
		pblk_trans_map_set(pblk, i, ppa);

	ret = pblk_l2p_recover(pblk, factory_init);
	if (ret)
		vfree(pblk->trans_map);

	return ret;
}
static void pblk_rwb_free(struct pblk *pblk)
{
	if (pblk_rb_tear_down_check(&pblk->rwb))
		pblk_err(pblk, "write buffer error on tear down\n");

	pblk_rb_free(&pblk->rwb);
}
static int pblk_rwb_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	unsigned long buffer_size;
	int pgs_in_buffer, threshold;

	threshold = geo->mw_cunits * geo->all_luns;
	pgs_in_buffer = (max(geo->mw_cunits, geo->ws_opt) + geo->ws_opt)
								* geo->all_luns;

	if (write_buffer_size && (write_buffer_size > pgs_in_buffer))
		buffer_size = write_buffer_size;
	else
		buffer_size = pgs_in_buffer;

	return pblk_rb_init(&pblk->rwb, buffer_size, threshold, geo->csecs);
}
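/*
 * Buffer sizing sketch (hypothetical numbers): the ring buffer must hold at
 * least max(mw_cunits, ws_opt) + ws_opt entries per LUN, so that data stays
 * readable from the cache until the device guarantees it can be read back
 * from media. With mw_cunits = 24, ws_opt = 8 and 64 LUNs this gives
 * (24 + 8) * 64 = 2048 entries; the write_buffer_size module parameter can
 * only grow this value, never shrink it.
 */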
static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo,
			     struct nvm_addrf_12 *dst)
{
	struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf;
	int power_len;

	/* Re-calculate channel and lun format to adapt to configuration */
	power_len = get_count_order(geo->num_ch);
	if (1 << power_len != geo->num_ch) {
		pblk_err(pblk, "supports only power-of-two channel config.\n");
		return -EINVAL;
	}
	dst->ch_len = power_len;

	power_len = get_count_order(geo->num_lun);
	if (1 << power_len != geo->num_lun) {
		pblk_err(pblk, "supports only power-of-two LUN config.\n");
		return -EINVAL;
	}
	dst->lun_len = power_len;

	dst->blk_len = src->blk_len;
	dst->pg_len = src->pg_len;
	dst->pln_len = src->pln_len;
	dst->sec_len = src->sec_len;

	dst->sec_offset = 0;
	dst->pln_offset = dst->sec_len;
	dst->ch_offset = dst->pln_offset + dst->pln_len;
	dst->lun_offset = dst->ch_offset + dst->ch_len;
	dst->pg_offset = dst->lun_offset + dst->lun_len;
	dst->blk_offset = dst->pg_offset + dst->pg_len;

	dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset;
	dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset;
	dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset;
	dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset;
	dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset;
	dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset;

	return dst->blk_offset + src->blk_len;
}
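/*
 * Layout sketch (hypothetical geometry): with num_ch = 16 and num_lun = 8,
 * ch_len = 4 and lun_len = 3, and a 1.2 physical address is packed from
 * least to most significant bits as sector | plane | channel | lun | page |
 * block, using the offsets and masks computed above. The return value is
 * the total number of address bits, which the caller stores in
 * pblk->addrf_len.
 */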
static int pblk_set_addrf_20(struct nvm_geo *geo, struct nvm_addrf *adst,
			     struct pblk_addrf *udst)
{
	struct nvm_addrf *src = &geo->addrf;

	adst->ch_len = get_count_order(geo->num_ch);
	adst->lun_len = get_count_order(geo->num_lun);
	adst->chk_len = src->chk_len;
	adst->sec_len = src->sec_len;

	adst->sec_offset = 0;
	adst->ch_offset = adst->sec_len;
	adst->lun_offset = adst->ch_offset + adst->ch_len;
	adst->chk_offset = adst->lun_offset + adst->lun_len;

	adst->sec_mask = ((1ULL << adst->sec_len) - 1) << adst->sec_offset;
	adst->chk_mask = ((1ULL << adst->chk_len) - 1) << adst->chk_offset;
	adst->lun_mask = ((1ULL << adst->lun_len) - 1) << adst->lun_offset;
	adst->ch_mask = ((1ULL << adst->ch_len) - 1) << adst->ch_offset;

	udst->sec_stripe = geo->ws_opt;
	udst->ch_stripe = geo->num_ch;
	udst->lun_stripe = geo->num_lun;

	udst->sec_lun_stripe = udst->sec_stripe * udst->ch_stripe;
	udst->sec_ws_stripe = udst->sec_lun_stripe * udst->lun_stripe;

	return adst->chk_offset + adst->chk_len;
}
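/*
 * Layout sketch (hypothetical geometry): a 2.0 address is packed from least
 * to most significant bits as sector | channel | lun | chunk. The udst
 * striping fields describe the write stripe: sec_stripe sectors per chunk,
 * advancing over all channels before moving to the next LUN, so with
 * ws_opt = 8, num_ch = 8 and num_lun = 4 one full write stripe covers
 * 8 * 8 * 4 = 256 sectors (sec_ws_stripe).
 */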
static int pblk_set_addrf(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	int mod;

	switch (geo->version) {
	case NVM_OCSSD_SPEC_12:
		div_u64_rem(geo->clba, pblk->min_write_pgs, &mod);
		if (mod) {
			pblk_err(pblk, "bad configuration of sectors/pages\n");
			return -EINVAL;
		}

		pblk->addrf_len = pblk_set_addrf_12(pblk, geo,
							(void *)&pblk->addrf);
		break;
	case NVM_OCSSD_SPEC_20:
		pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf,
							&pblk->uaddrf);
		break;
	default:
		pblk_err(pblk, "OCSSD revision not supported (%d)\n",
							geo->version);
		return -EINVAL;
	}

	return 0;
}
static int pblk_create_global_caches(void)
{
	pblk_caches.ws = kmem_cache_create("pblk_blk_ws",
				sizeof(struct pblk_line_ws), 0, 0, NULL);
	if (!pblk_caches.ws)
		return -ENOMEM;

	pblk_caches.rec = kmem_cache_create("pblk_rec",
				sizeof(struct pblk_rec_ctx), 0, 0, NULL);
	if (!pblk_caches.rec)
		goto fail_destroy_ws;

	pblk_caches.g_rq = kmem_cache_create("pblk_g_rq", pblk_g_rq_size,
				0, 0, NULL);
	if (!pblk_caches.g_rq)
		goto fail_destroy_rec;

	pblk_caches.w_rq = kmem_cache_create("pblk_w_rq", pblk_w_rq_size,
				0, 0, NULL);
	if (!pblk_caches.w_rq)
		goto fail_destroy_g_rq;

	return 0;

fail_destroy_g_rq:
	kmem_cache_destroy(pblk_caches.g_rq);
fail_destroy_rec:
	kmem_cache_destroy(pblk_caches.rec);
fail_destroy_ws:
	kmem_cache_destroy(pblk_caches.ws);

	return -ENOMEM;
}
static int pblk_get_global_caches(void)
{
	int ret = 0;

	mutex_lock(&pblk_caches.mutex);

	if (kref_get_unless_zero(&pblk_caches.kref))
		goto out;

	ret = pblk_create_global_caches();
	if (!ret)
		kref_init(&pblk_caches.kref);

out:
	mutex_unlock(&pblk_caches.mutex);
	return ret;
}
static void pblk_destroy_global_caches(struct kref *ref)
{
	struct pblk_global_caches *c;

	c = container_of(ref, struct pblk_global_caches, kref);

	kmem_cache_destroy(c->ws);
	kmem_cache_destroy(c->rec);
	kmem_cache_destroy(c->g_rq);
	kmem_cache_destroy(c->w_rq);
}

static void pblk_put_global_caches(void)
{
	mutex_lock(&pblk_caches.mutex);
	kref_put(&pblk_caches.kref, pblk_destroy_global_caches);
	mutex_unlock(&pblk_caches.mutex);
}
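/*
 * The slab caches above are shared by all pblk instances: the kref in
 * pblk_global_caches counts users, the first pblk_get_global_caches() call
 * creates the caches and initializes the kref, later calls only take a
 * reference, and the final pblk_put_global_caches() destroys them again.
 * The mutex serializes cache creation against teardown.
 */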
static int pblk_core_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	int ret, max_write_ppas;

	atomic64_set(&pblk->user_wa, 0);
	atomic64_set(&pblk->pad_wa, 0);
	atomic64_set(&pblk->gc_wa, 0);
	pblk->user_rst_wa = 0;
	pblk->pad_rst_wa = 0;
	pblk->gc_rst_wa = 0;

	atomic64_set(&pblk->nr_flush, 0);
	pblk->nr_flush_rst = 0;

	pblk->min_write_pgs = geo->ws_opt;
	pblk->min_write_pgs_data = pblk->min_write_pgs;
	max_write_ppas = pblk->min_write_pgs * geo->all_luns;
	pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
	pblk->max_write_pgs = min_t(int, pblk->max_write_pgs,
		queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT));
	pblk_set_sec_per_write(pblk, pblk->min_write_pgs);

	pblk->oob_meta_size = geo->sos;
	if (!pblk_is_oob_meta_supported(pblk)) {
		/* For drives that do not have the OOB metadata feature, we
		 * need so-called packed metadata in order to support recovery.
		 * Packed metadata stores the same information as OOB metadata
		 * (the l2p table mapping), but in the form of a single page at
		 * the end of every write request.
		 */
		if (pblk->min_write_pgs
			* sizeof(struct pblk_sec_meta) > PAGE_SIZE) {
			/* We want to keep all the packed metadata on a single
			 * page per write request, so we need to ensure that
			 * it fits.
			 *
			 * This is more of a sanity check, since there is
			 * no device with such a big minimal write size
			 * (above 1 megabyte).
			 */
			pblk_err(pblk, "Not supported min write size\n");
			return -EINVAL;
		}
		/* For the packed metadata approach we make a simplification:
		 * on the read path we always issue requests whose size equals
		 * max_write_pgs, with all pages filled with user payload
		 * except the last one, which holds the packed metadata.
		 */
		pblk->max_write_pgs = pblk->min_write_pgs;
		pblk->min_write_pgs_data = pblk->min_write_pgs - 1;
	}

	pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t),
								GFP_KERNEL);
	if (!pblk->pad_dist)
		return -ENOMEM;

	if (pblk_get_global_caches())
		goto fail_free_pad_dist;

	/* Internal bios can be at most the sectors signaled by the device. */
	ret = mempool_init_page_pool(&pblk->page_bio_pool, NVM_MAX_VLBA, 0);
	if (ret)
		goto free_global_caches;

	ret = mempool_init_slab_pool(&pblk->gen_ws_pool, PBLK_GEN_WS_POOL_SIZE,
				     pblk_caches.ws);
	if (ret)
		goto free_page_bio_pool;

	ret = mempool_init_slab_pool(&pblk->rec_pool, geo->all_luns,
				     pblk_caches.rec);
	if (ret)
		goto free_gen_ws_pool;

	ret = mempool_init_slab_pool(&pblk->r_rq_pool, geo->all_luns,
				     pblk_caches.g_rq);
	if (ret)
		goto free_rec_pool;

	ret = mempool_init_slab_pool(&pblk->e_rq_pool, geo->all_luns,
				     pblk_caches.g_rq);
	if (ret)
		goto free_r_rq_pool;

	ret = mempool_init_slab_pool(&pblk->w_rq_pool, geo->all_luns,
				     pblk_caches.w_rq);
	if (ret)
		goto free_e_rq_pool;

	pblk->close_wq = alloc_workqueue("pblk-close-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS);
	if (!pblk->close_wq)
		goto free_w_rq_pool;

	pblk->bb_wq = alloc_workqueue("pblk-bb-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!pblk->bb_wq)
		goto free_close_wq;

	pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!pblk->r_end_wq)
		goto free_bb_wq;

	if (pblk_set_addrf(pblk))
		goto free_r_end_wq;

	INIT_LIST_HEAD(&pblk->compl_list);
	INIT_LIST_HEAD(&pblk->resubmit_list);

	return 0;

free_r_end_wq:
	destroy_workqueue(pblk->r_end_wq);
free_bb_wq:
	destroy_workqueue(pblk->bb_wq);
free_close_wq:
	destroy_workqueue(pblk->close_wq);
free_w_rq_pool:
	mempool_exit(&pblk->w_rq_pool);
free_e_rq_pool:
	mempool_exit(&pblk->e_rq_pool);
free_r_rq_pool:
	mempool_exit(&pblk->r_rq_pool);
free_rec_pool:
	mempool_exit(&pblk->rec_pool);
free_gen_ws_pool:
	mempool_exit(&pblk->gen_ws_pool);
free_page_bio_pool:
	mempool_exit(&pblk->page_bio_pool);
free_global_caches:
	pblk_put_global_caches();
fail_free_pad_dist:
	kfree(pblk->pad_dist);
	return -ENOMEM;
}
static void pblk_core_free(struct pblk *pblk)
{
	if (pblk->close_wq)
		destroy_workqueue(pblk->close_wq);

	if (pblk->r_end_wq)
		destroy_workqueue(pblk->r_end_wq);

	if (pblk->bb_wq)
		destroy_workqueue(pblk->bb_wq);

	mempool_exit(&pblk->page_bio_pool);
	mempool_exit(&pblk->gen_ws_pool);
	mempool_exit(&pblk->rec_pool);
	mempool_exit(&pblk->r_rq_pool);
	mempool_exit(&pblk->e_rq_pool);
	mempool_exit(&pblk->w_rq_pool);

	pblk_put_global_caches();
	kfree(pblk->pad_dist);
}
static void pblk_line_mg_free(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	int i;

	kfree(l_mg->bb_template);
	kfree(l_mg->bb_aux);
	kfree(l_mg->vsc_list);

	for (i = 0; i < PBLK_DATA_LINES; i++) {
		kfree(l_mg->sline_meta[i]);
		kvfree(l_mg->eline_meta[i]->buf);
		kfree(l_mg->eline_meta[i]);
	}

	mempool_destroy(l_mg->bitmap_pool);
	kmem_cache_destroy(l_mg->bitmap_cache);
}
static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg,
				struct pblk_line *line)
{
	struct pblk_w_err_gc *w_err_gc = line->w_err_gc;

	kfree(line->blk_bitmap);
	kfree(line->erase_bitmap);
	kfree(line->chks);

	kvfree(w_err_gc->lba_list);
	kfree(w_err_gc);
}
static void pblk_lines_free(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line *line;
	int i;

	for (i = 0; i < l_mg->nr_lines; i++) {
		line = &pblk->lines[i];

		pblk_line_free(line);
		pblk_line_meta_free(l_mg, line);
	}

	pblk_line_mg_free(pblk);

	kfree(pblk->luns);
	kfree(pblk->lines);
}
static int pblk_luns_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_lun *rlun;
	int i;

	/* TODO: Implement unbalanced LUN support */
	if (geo->num_lun < 0) {
		pblk_err(pblk, "unbalanced LUN config.\n");
		return -EINVAL;
	}

	pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun),
								GFP_KERNEL);
	if (!pblk->luns)
		return -ENOMEM;

	for (i = 0; i < geo->all_luns; i++) {
		/* Stripe across channels */
		int ch = i % geo->num_ch;
		int lun_raw = i / geo->num_ch;
		int lunid = lun_raw + ch * geo->num_lun;

		rlun = &pblk->luns[i];
		rlun->bppa = dev->luns[lunid];

		sema_init(&rlun->wr_sem, 1);
	}

	return 0;
}
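/*
 * Striping sketch (hypothetical geometry): with num_ch = 8 and num_lun = 4,
 * i = 0..7 maps to LUN 0 of channels 0..7, i = 8..15 to LUN 1 of each
 * channel, and so on. For i = 9: ch = 1, lun_raw = 1, lunid = 1 + 1 * 4 = 5,
 * assuming dev->luns[] enumerates LUNs grouped by channel.
 */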
/* See comment over struct line_emeta definition */
static unsigned int calc_emeta_len(struct pblk *pblk)
{
	struct pblk_line_meta *lm = &pblk->lm;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;

	/* Round to sector size so that lba_list starts on its own sector */
	lm->emeta_sec[1] = DIV_ROUND_UP(
			sizeof(struct line_emeta) + lm->blk_bitmap_len +
			sizeof(struct wa_counters), geo->csecs);
	lm->emeta_len[1] = lm->emeta_sec[1] * geo->csecs;

	/* Round to sector size so that vsc_list starts on its own sector */
	lm->dsec_per_line = lm->sec_per_line - lm->emeta_sec[0];
	lm->emeta_sec[2] = DIV_ROUND_UP(lm->dsec_per_line * sizeof(u64),
			geo->csecs);
	lm->emeta_len[2] = lm->emeta_sec[2] * geo->csecs;

	lm->emeta_sec[3] = DIV_ROUND_UP(l_mg->nr_lines * sizeof(u32),
			geo->csecs);
	lm->emeta_len[3] = lm->emeta_sec[3] * geo->csecs;

	lm->vsc_list_len = l_mg->nr_lines * sizeof(u32);

	return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]);
}
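/*
 * emeta is kept in four regions, each rounded up to sector granularity:
 * [0] the total footprint sized by the caller, [1] the line_emeta header
 * plus bad-block bitmap and write-amplification counters, [2] the lba_list
 * with one u64 per data sector in the line, and [3] the vsc_list with one
 * u32 valid-sector count per line. This helper returns the sum of regions
 * 1-3, which pblk_line_meta_init() grows emeta_sec[0] to cover.
 */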
static int pblk_set_provision(struct pblk *pblk, int nr_free_chks)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	struct nvm_geo *geo = &dev->geo;
	sector_t provisioned;
	int sec_meta, blk_meta, clba;
	int minimum;

	if (geo->op == NVM_TARGET_DEFAULT_OP)
		pblk->op = PBLK_DEFAULT_OP;
	else
		pblk->op = geo->op;

	minimum = pblk_get_min_chks(pblk);
	provisioned = nr_free_chks;
	provisioned *= (100 - pblk->op);
	sector_div(provisioned, 100);

	if ((nr_free_chks - provisioned) < minimum) {
		if (geo->op != NVM_TARGET_DEFAULT_OP) {
			pblk_err(pblk, "OP too small to create a sane instance\n");
			return -EINTR;
		}

		/* If the user did not specify an OP value, and PBLK_DEFAULT_OP
		 * is not enough, calculate and set sane value
		 */
		provisioned = nr_free_chks - minimum;
		pblk->op = (100 * minimum) / nr_free_chks;
		pblk_info(pblk, "Default OP insufficient, adjusting OP to %d\n",
				pblk->op);
	}

	pblk->op_blks = nr_free_chks - provisioned;

	/* Internally pblk manages all free blocks, but all calculations based
	 * on user capacity consider only provisioned blocks
	 */
	pblk->rl.total_blocks = nr_free_chks;

	/* Consider sectors used for metadata */
	sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
	blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);

	clba = (geo->clba / pblk->min_write_pgs) * pblk->min_write_pgs_data;
	pblk->capacity = (provisioned - blk_meta) * clba;

	atomic_set(&pblk->rl.free_blocks, nr_free_chks);
	atomic_set(&pblk->rl.free_user_blocks, nr_free_chks);

	return 0;
}
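/*
 * Provisioning sketch (made-up numbers): with nr_free_chks = 1000 and an
 * over-provisioning value of 11, provisioned = 1000 * (100 - 11) / 100 = 890
 * chunks back user capacity while op_blks = 110 chunks stay reserved. If the
 * reserve fell below pblk_get_min_chks(), the OP is raised automatically,
 * unless the user chose the OP explicitly, in which case creation fails.
 */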
static int pblk_setup_line_meta_chk(struct pblk *pblk, struct pblk_line *line,
				   struct nvm_chk_meta *meta)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	int i, nr_bad_chks = 0;

	for (i = 0; i < lm->blk_per_line; i++) {
		struct pblk_lun *rlun = &pblk->luns[i];
		struct nvm_chk_meta *chunk;
		struct nvm_chk_meta *chunk_meta;
		struct ppa_addr ppa;
		int pos;

		ppa = rlun->bppa;
		pos = pblk_ppa_to_pos(geo, ppa);
		chunk = &line->chks[pos];

		ppa.m.chk = line->id;
		chunk_meta = pblk_chunk_get_off(pblk, meta, ppa);

		chunk->state = chunk_meta->state;
		chunk->type = chunk_meta->type;
		chunk->wi = chunk_meta->wi;
		chunk->slba = chunk_meta->slba;
		chunk->cnlb = chunk_meta->cnlb;
		chunk->wp = chunk_meta->wp;

		trace_pblk_chunk_state(pblk_disk_name(pblk), &ppa,
					chunk->state);

		if (chunk->type & NVM_CHK_TP_SZ_SPEC) {
			WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n");
			continue;
		}

		if (!(chunk->state & NVM_CHK_ST_OFFLINE))
			continue;

		set_bit(pos, line->blk_bitmap);
		nr_bad_chks++;
	}

	return nr_bad_chks;
}
static long pblk_setup_line_meta(struct pblk *pblk, struct pblk_line *line,
				 void *chunk_meta, int line_id)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	long nr_bad_chks, chk_in_line;

	line->pblk = pblk;
	line->id = line_id;
	line->type = PBLK_LINETYPE_FREE;
	line->state = PBLK_LINESTATE_NEW;
	line->gc_group = PBLK_LINEGC_NONE;
	line->vsc = &l_mg->vsc_list[line_id];
	spin_lock_init(&line->lock);

	nr_bad_chks = pblk_setup_line_meta_chk(pblk, line, chunk_meta);

	chk_in_line = lm->blk_per_line - nr_bad_chks;
	if (nr_bad_chks < 0 || nr_bad_chks > lm->blk_per_line ||
					chk_in_line < lm->min_blk_line) {
		line->state = PBLK_LINESTATE_BAD;
		list_add_tail(&line->list, &l_mg->bad_list);
		return 0;
	}

	atomic_set(&line->blk_in_line, chk_in_line);
	list_add_tail(&line->list, &l_mg->free_list);
	l_mg->nr_free_lines++;

	return chk_in_line;
}
static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line)
{
	struct pblk_line_meta *lm = &pblk->lm;

	line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
	if (!line->blk_bitmap)
		return -ENOMEM;

	line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
	if (!line->erase_bitmap)
		goto free_blk_bitmap;

	line->chks = kmalloc_array(lm->blk_per_line,
				   sizeof(struct nvm_chk_meta), GFP_KERNEL);
	if (!line->chks)
		goto free_erase_bitmap;

	line->w_err_gc = kzalloc(sizeof(struct pblk_w_err_gc), GFP_KERNEL);
	if (!line->w_err_gc)
		goto free_chks;

	return 0;

free_chks:
	kfree(line->chks);
free_erase_bitmap:
	kfree(line->erase_bitmap);
free_blk_bitmap:
	kfree(line->blk_bitmap);

	return -ENOMEM;
}
static int pblk_line_mg_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	int i, bb_distance;

	l_mg->nr_lines = geo->num_chk;
	l_mg->log_line = l_mg->data_line = NULL;
	l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
	l_mg->nr_free_lines = 0;
	bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);

	INIT_LIST_HEAD(&l_mg->free_list);
	INIT_LIST_HEAD(&l_mg->corrupt_list);
	INIT_LIST_HEAD(&l_mg->bad_list);
	INIT_LIST_HEAD(&l_mg->gc_full_list);
	INIT_LIST_HEAD(&l_mg->gc_high_list);
	INIT_LIST_HEAD(&l_mg->gc_mid_list);
	INIT_LIST_HEAD(&l_mg->gc_low_list);
	INIT_LIST_HEAD(&l_mg->gc_empty_list);
	INIT_LIST_HEAD(&l_mg->gc_werr_list);

	INIT_LIST_HEAD(&l_mg->emeta_list);

	l_mg->gc_lists[0] = &l_mg->gc_werr_list;
	l_mg->gc_lists[1] = &l_mg->gc_high_list;
	l_mg->gc_lists[2] = &l_mg->gc_mid_list;
	l_mg->gc_lists[3] = &l_mg->gc_low_list;

	spin_lock_init(&l_mg->free_lock);
	spin_lock_init(&l_mg->close_lock);
	spin_lock_init(&l_mg->gc_lock);

	l_mg->vsc_list = kcalloc(l_mg->nr_lines, sizeof(__le32), GFP_KERNEL);
	if (!l_mg->vsc_list)
		goto fail;

	l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
	if (!l_mg->bb_template)
		goto fail_free_vsc_list;

	l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
	if (!l_mg->bb_aux)
		goto fail_free_bb_template;

	/* smeta is always small enough to fit on a kmalloc memory allocation,
	 * emeta depends on the number of LUNs allocated to the pblk instance
	 */
	for (i = 0; i < PBLK_DATA_LINES; i++) {
		l_mg->sline_meta[i] = kmalloc(lm->smeta_len, GFP_KERNEL);
		if (!l_mg->sline_meta[i])
			goto fail_free_smeta;
	}

	l_mg->bitmap_cache = kmem_cache_create("pblk_lm_bitmap",
			lm->sec_bitmap_len, 0, 0, NULL);
	if (!l_mg->bitmap_cache)
		goto fail_free_smeta;

	/* the bitmap pool is used for both valid and map bitmaps */
	l_mg->bitmap_pool = mempool_create_slab_pool(PBLK_DATA_LINES * 2,
				l_mg->bitmap_cache);
	if (!l_mg->bitmap_pool)
		goto fail_destroy_bitmap_cache;

	/* emeta allocates three different buffers for managing metadata with
	 * in-memory and in-media layouts
	 */
	for (i = 0; i < PBLK_DATA_LINES; i++) {
		struct pblk_emeta *emeta;

		emeta = kmalloc(sizeof(struct pblk_emeta), GFP_KERNEL);
		if (!emeta)
			goto fail_free_emeta;

		emeta->buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL);
		if (!emeta->buf) {
			kfree(emeta);
			goto fail_free_emeta;
		}

		emeta->nr_entries = lm->emeta_sec[0];
		l_mg->eline_meta[i] = emeta;
	}

	for (i = 0; i < l_mg->nr_lines; i++)
		l_mg->vsc_list[i] = cpu_to_le32(EMPTY_ENTRY);

	bb_distance = (geo->all_luns) * geo->ws_opt;
	for (i = 0; i < lm->sec_per_line; i += bb_distance)
		bitmap_set(l_mg->bb_template, i, geo->ws_opt);

	return 0;

fail_free_emeta:
	while (--i >= 0) {
		kvfree(l_mg->eline_meta[i]->buf);
		kfree(l_mg->eline_meta[i]);
	}

	mempool_destroy(l_mg->bitmap_pool);
fail_destroy_bitmap_cache:
	kmem_cache_destroy(l_mg->bitmap_cache);
fail_free_smeta:
	for (i = 0; i < PBLK_DATA_LINES; i++)
		kfree(l_mg->sline_meta[i]);
fail_free_bb_template:
	kfree(l_mg->bb_template);
fail_free_vsc_list:
	kfree(l_mg->vsc_list);
fail:
	return -ENOMEM;
}
static int pblk_line_meta_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	unsigned int smeta_len, emeta_len;
	int i;

	lm->sec_per_line = geo->clba * geo->all_luns;
	lm->blk_per_line = geo->all_luns;
	lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
	lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
	lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
	lm->mid_thrs = lm->sec_per_line / 2;
	lm->high_thrs = lm->sec_per_line / 4;
	lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs;

	/* Calculate necessary pages for smeta. See comment over struct
	 * line_smeta definition
	 */
	i = 1;
add_smeta_page:
	lm->smeta_sec = i * geo->ws_opt;
	lm->smeta_len = lm->smeta_sec * geo->csecs;

	smeta_len = sizeof(struct line_smeta) + lm->lun_bitmap_len;
	if (smeta_len > lm->smeta_len) {
		i++;
		goto add_smeta_page;
	}

	/* Calculate necessary pages for emeta. See comment over struct
	 * line_emeta definition
	 */
	i = 1;
add_emeta_page:
	lm->emeta_sec[0] = i * geo->ws_opt;
	lm->emeta_len[0] = lm->emeta_sec[0] * geo->csecs;

	emeta_len = calc_emeta_len(pblk);
	if (emeta_len > lm->emeta_len[0]) {
		i++;
		goto add_emeta_page;
	}

	lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0;

	lm->min_blk_line = 1;
	if (geo->all_luns > 1)
		lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
					lm->emeta_sec[0], geo->clba);

	if (lm->min_blk_line > lm->blk_per_line) {
		pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n",
							lm->blk_per_line);
		return -EINVAL;
	}

	return 0;
}
static int pblk_lines_init(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line *line;
	void *chunk_meta;
	int nr_free_chks = 0;
	int i, ret;

	ret = pblk_line_meta_init(pblk);
	if (ret)
		return ret;

	ret = pblk_line_mg_init(pblk);
	if (ret)
		return ret;

	ret = pblk_luns_init(pblk);
	if (ret)
		goto fail_free_meta;

	chunk_meta = pblk_get_chunk_meta(pblk);
	if (IS_ERR(chunk_meta)) {
		ret = PTR_ERR(chunk_meta);
		goto fail_free_luns;
	}

	pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line),
								GFP_KERNEL);
	if (!pblk->lines) {
		ret = -ENOMEM;
		goto fail_free_chunk_meta;
	}

	for (i = 0; i < l_mg->nr_lines; i++) {
		line = &pblk->lines[i];

		ret = pblk_alloc_line_meta(pblk, line);
		if (ret)
			goto fail_free_lines;

		nr_free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i);

		trace_pblk_line_state(pblk_disk_name(pblk), line->id,
								line->state);
	}

	if (!nr_free_chks) {
		pblk_err(pblk, "too many bad blocks to create a sane instance\n");
		ret = -EINTR;
		goto fail_free_lines;
	}

	ret = pblk_set_provision(pblk, nr_free_chks);
	if (ret)
		goto fail_free_lines;

	vfree(chunk_meta);
	return 0;

fail_free_lines:
	while (--i >= 0)
		pblk_line_meta_free(l_mg, &pblk->lines[i]);
	kfree(pblk->lines);
fail_free_chunk_meta:
	vfree(chunk_meta);
fail_free_luns:
	kfree(pblk->luns);
fail_free_meta:
	pblk_line_mg_free(pblk);

	return ret;
}
static int pblk_writer_init(struct pblk *pblk)
{
	pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
	if (IS_ERR(pblk->writer_ts)) {
		int err = PTR_ERR(pblk->writer_ts);

		if (err != -EINTR)
			pblk_err(pblk, "could not allocate writer kthread (%d)\n",
					err);
		return err;
	}

	timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));

	return 0;
}
static void pblk_writer_stop(struct pblk *pblk)
{
	/* The pipeline must be stopped and the write buffer emptied before the
	 * write thread is stopped
	 */
	WARN(pblk_rb_read_count(&pblk->rwb),
			"Stopping not fully persisted write buffer\n");

	WARN(pblk_rb_sync_count(&pblk->rwb),
			"Stopping not fully synced write buffer\n");

	del_timer_sync(&pblk->wtimer);
	if (pblk->writer_ts)
		kthread_stop(pblk->writer_ts);
}
static void pblk_free(struct pblk *pblk)
{
	pblk_lines_free(pblk);
	pblk_l2p_free(pblk);
	pblk_rwb_free(pblk);
	pblk_core_free(pblk);

	kfree(pblk);
}
static void pblk_tear_down(struct pblk *pblk, bool graceful)
{
	if (graceful)
		__pblk_pipeline_flush(pblk);
	__pblk_pipeline_stop(pblk);
	pblk_writer_stop(pblk);
	pblk_rb_sync_l2p(&pblk->rwb);
	pblk_rl_free(&pblk->rl);

	pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful);
}
static void pblk_exit(void *private, bool graceful)
{
	struct pblk *pblk = private;

	pblk_gc_exit(pblk, graceful);
	pblk_tear_down(pblk, graceful);

#ifdef CONFIG_NVM_PBLK_DEBUG
	pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
#endif

	pblk_free(pblk);
}
static sector_t pblk_capacity(void *private)
{
	struct pblk *pblk = private;

	return pblk->capacity * NR_PHY_IN_LOG;
}
static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
		       int flags)
{
	struct nvm_geo *geo = &dev->geo;
	struct request_queue *bqueue = dev->q;
	struct request_queue *tqueue = tdisk->queue;
	struct pblk *pblk;
	int ret;

	pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
	if (!pblk)
		return ERR_PTR(-ENOMEM);

	pblk->dev = dev;
	pblk->disk = tdisk;
	pblk->state = PBLK_STATE_RUNNING;
	trace_pblk_state(pblk_disk_name(pblk), pblk->state);
	pblk->gc.gc_enabled = 0;

	if (!(geo->version == NVM_OCSSD_SPEC_12 ||
					geo->version == NVM_OCSSD_SPEC_20)) {
		pblk_err(pblk, "OCSSD version not supported (%u)\n",
							geo->version);
		kfree(pblk);
		return ERR_PTR(-EINVAL);
	}

	if (geo->ext) {
		pblk_err(pblk, "extended metadata not supported\n");
		kfree(pblk);
		return ERR_PTR(-EINVAL);
	}

	spin_lock_init(&pblk->resubmit_lock);
	spin_lock_init(&pblk->trans_lock);
	spin_lock_init(&pblk->lock);

#ifdef CONFIG_NVM_PBLK_DEBUG
	atomic_long_set(&pblk->inflight_writes, 0);
	atomic_long_set(&pblk->padded_writes, 0);
	atomic_long_set(&pblk->padded_wb, 0);
	atomic_long_set(&pblk->req_writes, 0);
	atomic_long_set(&pblk->sub_writes, 0);
	atomic_long_set(&pblk->sync_writes, 0);
	atomic_long_set(&pblk->inflight_reads, 0);
	atomic_long_set(&pblk->cache_reads, 0);
	atomic_long_set(&pblk->sync_reads, 0);
	atomic_long_set(&pblk->recov_writes, 0);
	atomic_long_set(&pblk->recov_writes, 0);
	atomic_long_set(&pblk->recov_gc_writes, 0);
	atomic_long_set(&pblk->recov_gc_reads, 0);
#endif

	atomic_long_set(&pblk->read_failed, 0);
	atomic_long_set(&pblk->read_empty, 0);
	atomic_long_set(&pblk->read_high_ecc, 0);
	atomic_long_set(&pblk->read_failed_gc, 0);
	atomic_long_set(&pblk->write_failed, 0);
	atomic_long_set(&pblk->erase_failed, 0);

	ret = pblk_core_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize core\n");
		goto fail;
	}

	ret = pblk_lines_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize lines\n");
		goto fail_free_core;
	}

	ret = pblk_rwb_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize write buffer\n");
		goto fail_free_lines;
	}

	ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY);
	if (ret) {
		pblk_err(pblk, "could not initialize maps\n");
		goto fail_free_rwb;
	}

	ret = pblk_writer_init(pblk);
	if (ret) {
		if (ret != -EINTR)
			pblk_err(pblk, "could not initialize write thread\n");
		goto fail_free_l2p;
	}

	ret = pblk_gc_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize gc\n");
		goto fail_stop_writer;
	}

	/* inherit the size from the underlying device */
	blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
	blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));

	blk_queue_write_cache(tqueue, true, false);

	tqueue->limits.discard_granularity = geo->clba * geo->csecs;
	tqueue->limits.discard_alignment = 0;
	blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue);

	pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
			geo->all_luns, pblk->l_mg.nr_lines,
			(unsigned long long)pblk->capacity,
			pblk->rwb.nr_entries);

	wake_up_process(pblk->writer_ts);

	/* Check if we need to start GC */
	pblk_gc_should_kick(pblk);

	return pblk;

fail_stop_writer:
	pblk_writer_stop(pblk);
fail_free_l2p:
	pblk_l2p_free(pblk);
fail_free_rwb:
	pblk_rwb_free(pblk);
fail_free_lines:
	pblk_lines_free(pblk);
fail_free_core:
	pblk_core_free(pblk);
fail:
	kfree(pblk);
	return ERR_PTR(ret);
}
/* physical block device target */
static struct nvm_tgt_type tt_pblk = {
	.name		= "pblk",
	.version	= {1, 0, 0},

	.bops		= &pblk_bops,
	.capacity	= pblk_capacity,

	.init		= pblk_init,
	.exit		= pblk_exit,

	.sysfs_init	= pblk_sysfs_init,
	.sysfs_exit	= pblk_sysfs_exit,
	.owner		= THIS_MODULE,
};
static int __init pblk_module_init(void)
{
	int ret;

	ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
	if (ret)
		return ret;
	ret = nvm_register_tgt_type(&tt_pblk);
	if (ret)
		bioset_exit(&pblk_bio_set);
	return ret;
}

static void pblk_module_exit(void)
{
	bioset_exit(&pblk_bio_set);
	nvm_unregister_tgt_type(&tt_pblk);
}

module_init(pblk_module_init);
module_exit(pblk_module_exit);
MODULE_AUTHOR("Javier Gonzalez <javier@cnexlabs.com>");
MODULE_AUTHOR("Matias Bjorling <matias@cnexlabs.com>");
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs");