/*
 * Copyright (C) 2015 IT University of Copenhagen (rrpc.c)
 * Copyright (C) 2016 CNEX Labs
 * Initial release: Javier Gonzalez <javier@cnexlabs.com>
 *                  Matias Bjorling <matias@cnexlabs.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Implementation of a physical block-device target for Open-channel SSDs.
 *
 * pblk-init.c - pblk's initialization.
 */
#include "pblk.h"

static unsigned int write_buffer_size;
module_param(write_buffer_size, uint, 0644);
MODULE_PARM_DESC(write_buffer_size, "number of entries in a write buffer");
static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache,
				*pblk_w_rq_cache;

static DECLARE_RWSEM(pblk_lock);
struct bio_set pblk_bio_set;
static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
			  struct bio *bio)
{
	int ret;

	/* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap
	 * constraint. Writes can be of arbitrary size.
	 */
	if (bio_data_dir(bio) == READ) {
		blk_queue_split(q, &bio);
		ret = pblk_submit_read(pblk, bio);
		if (ret == NVM_IO_DONE && bio_flagged(bio, BIO_CLONED))
			bio_put(bio);

		return ret;
	}

	/* Prevent deadlock in the case of a modest LUN configuration and large
	 * user I/Os. Unless stalled, the rate limiter leaves at least 256KB
	 * available for user I/O.
	 */
	if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl))
		blk_queue_split(q, &bio);

	return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
}
static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio)
{
	struct pblk *pblk = q->queuedata;

	if (bio_op(bio) == REQ_OP_DISCARD) {
		pblk_discard(pblk, bio);
		if (!(bio->bi_opf & REQ_PREFLUSH)) {
			bio_endio(bio);
			return BLK_QC_T_NONE;
		}
	}

	switch (pblk_rw_io(q, pblk, bio)) {
	case NVM_IO_ERR:
		bio_io_error(bio);
		break;
	case NVM_IO_DONE:
		bio_endio(bio);
		break;
	}

	return BLK_QC_T_NONE;
}
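/*
 * Request routing summary: discards are handled inline by pblk_discard()
 * above; a discard bio that also carries REQ_PREFLUSH falls through so the
 * flush still reaches the write path. All other bios go through
 * pblk_rw_io(), where reads are split and submitted directly and writes are
 * buffered via pblk_write_to_cache().
 */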
static size_t pblk_trans_map_size(struct pblk *pblk)
{
	int entry_size = 8;

	if (pblk->addrf_len < 32)
		entry_size = 4;

	return entry_size * pblk->rl.nr_secs;
}
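/*
 * Sizing example (hypothetical numbers): with an address format shorter
 * than 32 bits each L2P entry needs 4 bytes, otherwise 8. A device exposing
 * 256M user sectors therefore needs roughly 1 GiB vs 2 GiB of host memory
 * for the table.
 */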
#ifdef CONFIG_NVM_PBLK_DEBUG
static u32 pblk_l2p_crc(struct pblk *pblk)
{
	size_t map_size;
	u32 crc = ~(u32)0;

	map_size = pblk_trans_map_size(pblk);
	crc = crc32_le(crc, pblk->trans_map, map_size);
	return crc;
}
#endif
static void pblk_l2p_free(struct pblk *pblk)
{
	vfree(pblk->trans_map);
}
static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
{
	struct pblk_line *line = NULL;

	if (factory_init) {
		pblk_setup_uuid(pblk);
	} else {
		line = pblk_recov_l2p(pblk);
		if (IS_ERR(line)) {
			pblk_err(pblk, "could not recover l2p table\n");
			return -EFAULT;
		}
	}

#ifdef CONFIG_NVM_PBLK_DEBUG
	pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
#endif

	/* Free full lines directly as GC has not been started yet */
	pblk_gc_free_full_lines(pblk);

	if (!line) {
		/* Configure next line for user data */
		line = pblk_line_get_first_data(pblk);
		if (!line)
			return -EFAULT;
	}

	return 0;
}
static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
{
	struct ppa_addr ppa;
	size_t map_size;
	sector_t i;
	int ret = 0;

	map_size = pblk_trans_map_size(pblk);
	pblk->trans_map = vmalloc(map_size);
	if (!pblk->trans_map)
		return -ENOMEM;

	pblk_ppa_set_empty(&ppa);

	for (i = 0; i < pblk->rl.nr_secs; i++)
		pblk_trans_map_set(pblk, i, ppa);

	ret = pblk_l2p_recover(pblk, factory_init);
	if (ret)
		vfree(pblk->trans_map);

	return ret;
}
static void pblk_rwb_free(struct pblk *pblk)
{
	if (pblk_rb_tear_down_check(&pblk->rwb))
		pblk_err(pblk, "write buffer error on tear down\n");

	pblk_rb_data_free(&pblk->rwb);
	vfree(pblk_rb_entries_ref(&pblk->rwb));
}
static int pblk_rwb_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_rb_entry *entries;
	unsigned long nr_entries, buffer_size;
	unsigned int power_size, power_seg_sz;
	int pgs_in_buffer;

	pgs_in_buffer = max(geo->mw_cunits, geo->ws_opt) * geo->all_luns;

	if (write_buffer_size && (write_buffer_size > pgs_in_buffer))
		buffer_size = write_buffer_size;
	else
		buffer_size = pgs_in_buffer;

	nr_entries = pblk_rb_calculate_size(buffer_size);

	entries = vzalloc(array_size(nr_entries, sizeof(struct pblk_rb_entry)));
	if (!entries)
		return -ENOMEM;

	power_size = get_count_order(nr_entries);
	power_seg_sz = get_count_order(geo->csecs);

	return pblk_rb_init(&pblk->rwb, entries, power_size, power_seg_sz);
}
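/*
 * Buffer sizing example (hypothetical geometry): with mw_cunits = 24,
 * ws_opt = 8 and 128 LUNs, pgs_in_buffer = max(24, 8) * 128 = 3072 entries.
 * The write_buffer_size module parameter only takes effect when it requests
 * more entries than that baseline.
 */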
/* Minimum pages needed within a lun */
#define ADDR_POOL_SIZE 64
static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo,
			     struct nvm_addrf_12 *dst)
{
	struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf;
	int power_len;

	/* Re-calculate channel and lun format to adapt to configuration */
	power_len = get_count_order(geo->num_ch);
	if (1 << power_len != geo->num_ch) {
		pblk_err(pblk, "supports only power-of-two channel config.\n");
		return -EINVAL;
	}
	dst->ch_len = power_len;

	power_len = get_count_order(geo->num_lun);
	if (1 << power_len != geo->num_lun) {
		pblk_err(pblk, "supports only power-of-two LUN config.\n");
		return -EINVAL;
	}
	dst->lun_len = power_len;

	dst->blk_len = src->blk_len;
	dst->pg_len = src->pg_len;
	dst->pln_len = src->pln_len;
	dst->sec_len = src->sec_len;

	dst->sec_offset = 0;
	dst->pln_offset = dst->sec_len;
	dst->ch_offset = dst->pln_offset + dst->pln_len;
	dst->lun_offset = dst->ch_offset + dst->ch_len;
	dst->pg_offset = dst->lun_offset + dst->lun_len;
	dst->blk_offset = dst->pg_offset + dst->pg_len;

	dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset;
	dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset;
	dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset;
	dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset;
	dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset;
	dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset;

	return dst->blk_offset + src->blk_len;
}
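/*
 * Layout example (hypothetical 1.2 geometry): with sec_len = 2, pln_len = 1,
 * ch_len = 3, lun_len = 2, pg_len = 8 and blk_len = 10, the offsets stack
 * as sec:0, pln:2, ch:3, lun:6, pg:8, blk:16, and the function returns
 * blk_offset + blk_len = 26 as the total address width in bits.
 */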
static int pblk_set_addrf_20(struct nvm_geo *geo, struct nvm_addrf *adst,
			     struct pblk_addrf *udst)
{
	struct nvm_addrf *src = &geo->addrf;

	adst->ch_len = get_count_order(geo->num_ch);
	adst->lun_len = get_count_order(geo->num_lun);
	adst->chk_len = src->chk_len;
	adst->sec_len = src->sec_len;

	adst->sec_offset = 0;
	adst->ch_offset = adst->sec_len;
	adst->lun_offset = adst->ch_offset + adst->ch_len;
	adst->chk_offset = adst->lun_offset + adst->lun_len;

	adst->sec_mask = ((1ULL << adst->sec_len) - 1) << adst->sec_offset;
	adst->chk_mask = ((1ULL << adst->chk_len) - 1) << adst->chk_offset;
	adst->lun_mask = ((1ULL << adst->lun_len) - 1) << adst->lun_offset;
	adst->ch_mask = ((1ULL << adst->ch_len) - 1) << adst->ch_offset;

	udst->sec_stripe = geo->ws_opt;
	udst->ch_stripe = geo->num_ch;
	udst->lun_stripe = geo->num_lun;

	udst->sec_lun_stripe = udst->sec_stripe * udst->ch_stripe;
	udst->sec_ws_stripe = udst->sec_lun_stripe * udst->lun_stripe;

	return adst->chk_offset + adst->chk_len;
}
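/*
 * Stripe example (hypothetical 2.0 geometry): with ws_opt = 8, num_ch = 8
 * and num_lun = 4, sec_stripe = 8, sec_lun_stripe = 8 * 8 = 64 and
 * sec_ws_stripe = 64 * 4 = 256, i.e. 256 sectors fill one full write-unit
 * stripe across every LUN before the mapping wraps around.
 */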
static int pblk_set_addrf(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	int mod;

	switch (geo->version) {
	case NVM_OCSSD_SPEC_12:
		div_u64_rem(geo->clba, pblk->min_write_pgs, &mod);
		if (mod) {
			pblk_err(pblk, "bad configuration of sectors/pages\n");
			return -EINVAL;
		}

		pblk->addrf_len = pblk_set_addrf_12(pblk, geo,
							(void *)&pblk->addrf);
		break;
	case NVM_OCSSD_SPEC_20:
		pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf,
							&pblk->uaddrf);
		break;
	default:
		pblk_err(pblk, "OCSSD revision not supported (%d)\n",
							geo->version);
		return -EINVAL;
	}

	return 0;
}
static int pblk_init_global_caches(struct pblk *pblk)
{
	down_write(&pblk_lock);
	pblk_ws_cache = kmem_cache_create("pblk_blk_ws",
				sizeof(struct pblk_line_ws), 0, 0, NULL);
	if (!pblk_ws_cache) {
		up_write(&pblk_lock);
		return -ENOMEM;
	}

	pblk_rec_cache = kmem_cache_create("pblk_rec",
				sizeof(struct pblk_rec_ctx), 0, 0, NULL);
	if (!pblk_rec_cache) {
		kmem_cache_destroy(pblk_ws_cache);
		up_write(&pblk_lock);
		return -ENOMEM;
	}

	pblk_g_rq_cache = kmem_cache_create("pblk_g_rq", pblk_g_rq_size,
				0, 0, NULL);
	if (!pblk_g_rq_cache) {
		kmem_cache_destroy(pblk_ws_cache);
		kmem_cache_destroy(pblk_rec_cache);
		up_write(&pblk_lock);
		return -ENOMEM;
	}

	pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size,
				0, 0, NULL);
	if (!pblk_w_rq_cache) {
		kmem_cache_destroy(pblk_ws_cache);
		kmem_cache_destroy(pblk_rec_cache);
		kmem_cache_destroy(pblk_g_rq_cache);
		up_write(&pblk_lock);
		return -ENOMEM;
	}

	up_write(&pblk_lock);

	return 0;
}
static void pblk_free_global_caches(struct pblk *pblk)
{
	kmem_cache_destroy(pblk_ws_cache);
	kmem_cache_destroy(pblk_rec_cache);
	kmem_cache_destroy(pblk_g_rq_cache);
	kmem_cache_destroy(pblk_w_rq_cache);
}
static int pblk_core_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	int ret, max_write_ppas;

	atomic64_set(&pblk->user_wa, 0);
	atomic64_set(&pblk->pad_wa, 0);
	atomic64_set(&pblk->gc_wa, 0);
	pblk->user_rst_wa = 0;
	pblk->pad_rst_wa = 0;
	pblk->gc_rst_wa = 0;

	atomic64_set(&pblk->nr_flush, 0);
	pblk->nr_flush_rst = 0;

	pblk->min_write_pgs = geo->ws_opt * (geo->csecs / PAGE_SIZE);
	max_write_ppas = pblk->min_write_pgs * geo->all_luns;
	pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
	pblk_set_sec_per_write(pblk, pblk->min_write_pgs);

	if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
		pblk_err(pblk, "vector list too big(%u > %u)\n",
				pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS);
		return -EINVAL;
	}

	pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t),
								GFP_KERNEL);
	if (!pblk->pad_dist)
		return -ENOMEM;

	if (pblk_init_global_caches(pblk))
		goto fail_free_pad_dist;

	/* Internal bios can be at most the sectors signaled by the device. */
	ret = mempool_init_page_pool(&pblk->page_bio_pool, NVM_MAX_VLBA, 0);
	if (ret)
		goto free_global_caches;

	ret = mempool_init_slab_pool(&pblk->gen_ws_pool, PBLK_GEN_WS_POOL_SIZE,
				     pblk_ws_cache);
	if (ret)
		goto free_page_bio_pool;

	ret = mempool_init_slab_pool(&pblk->rec_pool, geo->all_luns,
				     pblk_rec_cache);
	if (ret)
		goto free_gen_ws_pool;

	ret = mempool_init_slab_pool(&pblk->r_rq_pool, geo->all_luns,
				     pblk_g_rq_cache);
	if (ret)
		goto free_rec_pool;

	ret = mempool_init_slab_pool(&pblk->e_rq_pool, geo->all_luns,
				     pblk_g_rq_cache);
	if (ret)
		goto free_r_rq_pool;

	ret = mempool_init_slab_pool(&pblk->w_rq_pool, geo->all_luns,
				     pblk_w_rq_cache);
	if (ret)
		goto free_e_rq_pool;

	pblk->close_wq = alloc_workqueue("pblk-close-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS);
	if (!pblk->close_wq)
		goto free_w_rq_pool;

	pblk->bb_wq = alloc_workqueue("pblk-bb-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!pblk->bb_wq)
		goto free_close_wq;

	pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!pblk->r_end_wq)
		goto free_bb_wq;

	if (pblk_set_addrf(pblk))
		goto free_r_end_wq;

	INIT_LIST_HEAD(&pblk->compl_list);
	INIT_LIST_HEAD(&pblk->resubmit_list);

	return 0;

free_r_end_wq:
	destroy_workqueue(pblk->r_end_wq);
free_bb_wq:
	destroy_workqueue(pblk->bb_wq);
free_close_wq:
	destroy_workqueue(pblk->close_wq);
free_w_rq_pool:
	mempool_exit(&pblk->w_rq_pool);
free_e_rq_pool:
	mempool_exit(&pblk->e_rq_pool);
free_r_rq_pool:
	mempool_exit(&pblk->r_rq_pool);
free_rec_pool:
	mempool_exit(&pblk->rec_pool);
free_gen_ws_pool:
	mempool_exit(&pblk->gen_ws_pool);
free_page_bio_pool:
	mempool_exit(&pblk->page_bio_pool);
free_global_caches:
	pblk_free_global_caches(pblk);
fail_free_pad_dist:
	kfree(pblk->pad_dist);
	return -ENOMEM;
}
static void pblk_core_free(struct pblk *pblk)
{
	if (pblk->close_wq)
		destroy_workqueue(pblk->close_wq);

	if (pblk->r_end_wq)
		destroy_workqueue(pblk->r_end_wq);

	if (pblk->bb_wq)
		destroy_workqueue(pblk->bb_wq);

	mempool_exit(&pblk->page_bio_pool);
	mempool_exit(&pblk->gen_ws_pool);
	mempool_exit(&pblk->rec_pool);
	mempool_exit(&pblk->r_rq_pool);
	mempool_exit(&pblk->e_rq_pool);
	mempool_exit(&pblk->w_rq_pool);

	pblk_free_global_caches(pblk);
	kfree(pblk->pad_dist);
}
static void pblk_line_mg_free(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	int i;

	kfree(l_mg->bb_template);
	kfree(l_mg->bb_aux);
	kfree(l_mg->vsc_list);

	for (i = 0; i < PBLK_DATA_LINES; i++) {
		kfree(l_mg->sline_meta[i]);
		pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type);
		kfree(l_mg->eline_meta[i]);
	}
}
static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg,
				struct pblk_line *line)
{
	struct pblk_w_err_gc *w_err_gc = line->w_err_gc;

	kfree(line->blk_bitmap);
	kfree(line->erase_bitmap);
	kfree(line->chks);

	pblk_mfree(w_err_gc->lba_list, l_mg->emeta_alloc_type);
	kfree(w_err_gc);
}
static void pblk_lines_free(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line *line;
	int i;

	spin_lock(&l_mg->free_lock);
	for (i = 0; i < l_mg->nr_lines; i++) {
		line = &pblk->lines[i];

		pblk_line_free(line);
		pblk_line_meta_free(l_mg, line);
	}
	spin_unlock(&l_mg->free_lock);

	pblk_line_mg_free(pblk);

	kfree(pblk->luns);
	kfree(pblk->lines);
}
static int pblk_bb_get_tbl(struct nvm_tgt_dev *dev, struct pblk_lun *rlun,
			   u8 *blks, int nr_blks)
{
	struct ppa_addr ppa;
	int ret;

	ppa.ppa = 0;
	ppa.g.ch = rlun->bppa.g.ch;
	ppa.g.lun = rlun->bppa.g.lun;

	ret = nvm_get_tgt_bb_tbl(dev, ppa, blks);
	if (ret)
		return ret;

	nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks);
	if (nr_blks < 0)
		return -EIO;

	return 0;
}
static void *pblk_bb_get_meta(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	u8 *meta;
	int i, nr_blks, blk_per_lun;
	int ret;

	blk_per_lun = geo->num_chk * geo->pln_mode;
	nr_blks = blk_per_lun * geo->all_luns;

	meta = kmalloc(nr_blks, GFP_KERNEL);
	if (!meta)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < geo->all_luns; i++) {
		struct pblk_lun *rlun = &pblk->luns[i];
		u8 *meta_pos = meta + i * blk_per_lun;

		ret = pblk_bb_get_tbl(dev, rlun, meta_pos, blk_per_lun);
		if (ret) {
			kfree(meta);
			return ERR_PTR(-EIO);
		}
	}

	return meta;
}
static void *pblk_chunk_get_meta(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;

	if (geo->version == NVM_OCSSD_SPEC_12)
		return pblk_bb_get_meta(pblk);
	else
		return pblk_chunk_get_info(pblk);
}
static int pblk_luns_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_lun *rlun;
	int i;

	/* TODO: Implement unbalanced LUN support */
	if (geo->num_lun < 0) {
		pblk_err(pblk, "unbalanced LUN config.\n");
		return -EINVAL;
	}

	pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun),
								GFP_KERNEL);
	if (!pblk->luns)
		return -ENOMEM;

	for (i = 0; i < geo->all_luns; i++) {
		/* Stripe across channels */
		int ch = i % geo->num_ch;
		int lun_raw = i / geo->num_ch;
		int lunid = lun_raw + ch * geo->num_lun;

		rlun = &pblk->luns[i];
		rlun->bppa = dev->luns[lunid];

		sema_init(&rlun->wr_sem, 1);
	}

	return 0;
}
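/*
 * Striping example (hypothetical geometry): with num_ch = 4 and num_lun = 2,
 * internal LUN index 5 maps to ch = 5 % 4 = 1 and lun_raw = 5 / 4 = 1, so
 * lunid = 1 + 1 * 2 = 3 in the device's LUN array; consecutive internal
 * LUNs therefore land on different channels.
 */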
/* See comment over struct line_emeta definition */
static unsigned int calc_emeta_len(struct pblk *pblk)
{
	struct pblk_line_meta *lm = &pblk->lm;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;

	/* Round to sector size so that lba_list starts on its own sector */
	lm->emeta_sec[1] = DIV_ROUND_UP(
			sizeof(struct line_emeta) + lm->blk_bitmap_len +
			sizeof(struct wa_counters), geo->csecs);
	lm->emeta_len[1] = lm->emeta_sec[1] * geo->csecs;

	/* Round to sector size so that vsc_list starts on its own sector */
	lm->dsec_per_line = lm->sec_per_line - lm->emeta_sec[0];
	lm->emeta_sec[2] = DIV_ROUND_UP(lm->dsec_per_line * sizeof(u64),
			geo->csecs);
	lm->emeta_len[2] = lm->emeta_sec[2] * geo->csecs;

	lm->emeta_sec[3] = DIV_ROUND_UP(l_mg->nr_lines * sizeof(u32),
			geo->csecs);
	lm->emeta_len[3] = lm->emeta_sec[3] * geo->csecs;

	lm->vsc_list_len = l_mg->nr_lines * sizeof(u32);

	return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]);
}
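/*
 * The three sizes computed above cover the variable parts of emeta:
 * emeta[1] holds the line_emeta header plus the chunk bitmap and write
 * amplification counters, emeta[2] the lba_list (one u64 per data sector in
 * the line) and emeta[3] the vsc_list (one u32 valid-sector counter per
 * line), each rounded up to whole device sectors (geo->csecs).
 */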
static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	struct nvm_geo *geo = &dev->geo;
	sector_t provisioned;
	int sec_meta, blk_meta;

	if (geo->op == NVM_TARGET_DEFAULT_OP)
		pblk->op = PBLK_DEFAULT_OP;
	else
		pblk->op = geo->op;

	provisioned = nr_free_blks;
	provisioned *= (100 - pblk->op);
	sector_div(provisioned, 100);

	pblk->op_blks = nr_free_blks - provisioned;

	/* Internally pblk manages all free blocks, but all calculations based
	 * on user capacity consider only provisioned blocks
	 */
	pblk->rl.total_blocks = nr_free_blks;
	pblk->rl.nr_secs = nr_free_blks * geo->clba;

	/* Consider sectors used for metadata */
	sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
	blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);

	pblk->capacity = (provisioned - blk_meta) * geo->clba;

	atomic_set(&pblk->rl.free_blocks, nr_free_blks);
	atomic_set(&pblk->rl.free_user_blocks, nr_free_blks);
}
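/*
 * Over-provisioning example (hypothetical numbers): with 1000 free chunks
 * and an OP of 11%, provisioned = 1000 * (100 - 11) / 100 = 890 chunks are
 * exposed to the user and op_blks = 110 chunks are held back; the chunks
 * needed for smeta/emeta are additionally subtracted from the exported
 * capacity.
 */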
static int pblk_setup_line_meta_12(struct pblk *pblk, struct pblk_line *line,
				   void *chunk_meta)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	int i, chk_per_lun, nr_bad_chks = 0;

	chk_per_lun = geo->num_chk * geo->pln_mode;

	for (i = 0; i < lm->blk_per_line; i++) {
		struct pblk_lun *rlun = &pblk->luns[i];
		struct nvm_chk_meta *chunk;
		int pos = pblk_ppa_to_pos(geo, rlun->bppa);
		u8 *lun_bb_meta = chunk_meta + pos * chk_per_lun;

		chunk = &line->chks[pos];

		/*
		 * In 1.2 spec. chunk state is not persisted by the device. Thus
		 * some of the values are reset each time pblk is instantiated,
		 * so we have to assume that the block is closed.
		 */
		if (lun_bb_meta[line->id] == NVM_BLK_T_FREE)
			chunk->state = NVM_CHK_ST_CLOSED;
		else
			chunk->state = NVM_CHK_ST_OFFLINE;

		chunk->type = NVM_CHK_TP_W_SEQ;
		chunk->cnlb = geo->clba;

		if (!(chunk->state & NVM_CHK_ST_OFFLINE))
			continue;

		set_bit(pos, line->blk_bitmap);
		nr_bad_chks++;
	}

	return nr_bad_chks;
}
static int pblk_setup_line_meta_20(struct pblk *pblk, struct pblk_line *line,
				   struct nvm_chk_meta *meta)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	int i, nr_bad_chks = 0;

	for (i = 0; i < lm->blk_per_line; i++) {
		struct pblk_lun *rlun = &pblk->luns[i];
		struct nvm_chk_meta *chunk;
		struct nvm_chk_meta *chunk_meta;
		struct ppa_addr ppa;
		int pos;

		ppa = rlun->bppa;
		pos = pblk_ppa_to_pos(geo, ppa);
		chunk = &line->chks[pos];

		ppa.m.chk = line->id;
		chunk_meta = pblk_chunk_get_off(pblk, meta, ppa);

		chunk->state = chunk_meta->state;
		chunk->type = chunk_meta->type;
		chunk->wi = chunk_meta->wi;
		chunk->slba = chunk_meta->slba;
		chunk->cnlb = chunk_meta->cnlb;
		chunk->wp = chunk_meta->wp;

		if (chunk->type & NVM_CHK_TP_SZ_SPEC) {
			WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n");
		}

		if (!(chunk->state & NVM_CHK_ST_OFFLINE))
			continue;

		set_bit(pos, line->blk_bitmap);
		nr_bad_chks++;
	}

	return nr_bad_chks;
}
static long pblk_setup_line_meta(struct pblk *pblk, struct pblk_line *line,
				 void *chunk_meta, int line_id)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	long nr_bad_chks, chk_in_line;

	line->id = line_id;
	line->type = PBLK_LINETYPE_FREE;
	line->state = PBLK_LINESTATE_NEW;
	line->gc_group = PBLK_LINEGC_NONE;
	line->vsc = &l_mg->vsc_list[line_id];
	spin_lock_init(&line->lock);

	if (geo->version == NVM_OCSSD_SPEC_12)
		nr_bad_chks = pblk_setup_line_meta_12(pblk, line, chunk_meta);
	else
		nr_bad_chks = pblk_setup_line_meta_20(pblk, line, chunk_meta);

	chk_in_line = lm->blk_per_line - nr_bad_chks;
	if (nr_bad_chks < 0 || nr_bad_chks > lm->blk_per_line ||
					chk_in_line < lm->min_blk_line) {
		line->state = PBLK_LINESTATE_BAD;
		list_add_tail(&line->list, &l_mg->bad_list);
		return 0;
	}

	atomic_set(&line->blk_in_line, chk_in_line);
	list_add_tail(&line->list, &l_mg->free_list);
	l_mg->nr_free_lines++;

	return chk_in_line;
}
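/*
 * A line is only added to the free list when enough chunks survive the
 * bad-chunk accounting above: if fewer than lm->min_blk_line usable chunks
 * remain (or nr_bad_chks is out of range), the line is parked on bad_list
 * and contributes nothing to the free-chunk count.
 */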
static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line)
{
	struct pblk_line_meta *lm = &pblk->lm;

	line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
	if (!line->blk_bitmap)
		return -ENOMEM;

	line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
	if (!line->erase_bitmap)
		goto free_blk_bitmap;

	line->chks = kmalloc_array(lm->blk_per_line,
				   sizeof(struct nvm_chk_meta), GFP_KERNEL);
	if (!line->chks)
		goto free_erase_bitmap;

	line->w_err_gc = kzalloc(sizeof(struct pblk_w_err_gc), GFP_KERNEL);
	if (!line->w_err_gc)
		goto free_chks;

	return 0;

free_chks:
	kfree(line->chks);
free_erase_bitmap:
	kfree(line->erase_bitmap);
free_blk_bitmap:
	kfree(line->blk_bitmap);

	return -ENOMEM;
}
static int pblk_line_mg_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	int i, bb_distance;

	l_mg->nr_lines = geo->num_chk;
	l_mg->log_line = l_mg->data_line = NULL;
	l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
	l_mg->nr_free_lines = 0;
	bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);

	INIT_LIST_HEAD(&l_mg->free_list);
	INIT_LIST_HEAD(&l_mg->corrupt_list);
	INIT_LIST_HEAD(&l_mg->bad_list);
	INIT_LIST_HEAD(&l_mg->gc_full_list);
	INIT_LIST_HEAD(&l_mg->gc_high_list);
	INIT_LIST_HEAD(&l_mg->gc_mid_list);
	INIT_LIST_HEAD(&l_mg->gc_low_list);
	INIT_LIST_HEAD(&l_mg->gc_empty_list);
	INIT_LIST_HEAD(&l_mg->gc_werr_list);

	INIT_LIST_HEAD(&l_mg->emeta_list);

	l_mg->gc_lists[0] = &l_mg->gc_werr_list;
	l_mg->gc_lists[1] = &l_mg->gc_high_list;
	l_mg->gc_lists[2] = &l_mg->gc_mid_list;
	l_mg->gc_lists[3] = &l_mg->gc_low_list;

	spin_lock_init(&l_mg->free_lock);
	spin_lock_init(&l_mg->close_lock);
	spin_lock_init(&l_mg->gc_lock);

	l_mg->vsc_list = kcalloc(l_mg->nr_lines, sizeof(__le32), GFP_KERNEL);
	if (!l_mg->vsc_list)
		goto fail;

	l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
	if (!l_mg->bb_template)
		goto fail_free_vsc_list;

	l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
	if (!l_mg->bb_aux)
		goto fail_free_bb_template;

	/* smeta is always small enough to fit on a kmalloc memory allocation,
	 * emeta depends on the number of LUNs allocated to the pblk instance
	 */
	for (i = 0; i < PBLK_DATA_LINES; i++) {
		l_mg->sline_meta[i] = kmalloc(lm->smeta_len, GFP_KERNEL);
		if (!l_mg->sline_meta[i])
			goto fail_free_smeta;
	}

	/* emeta allocates three different buffers for managing metadata with
	 * in-memory and in-media layouts
	 */
	for (i = 0; i < PBLK_DATA_LINES; i++) {
		struct pblk_emeta *emeta;

		emeta = kmalloc(sizeof(struct pblk_emeta), GFP_KERNEL);
		if (!emeta)
			goto fail_free_emeta;

		if (lm->emeta_len[0] > KMALLOC_MAX_CACHE_SIZE) {
			l_mg->emeta_alloc_type = PBLK_VMALLOC_META;

			emeta->buf = vmalloc(lm->emeta_len[0]);
			if (!emeta->buf) {
				kfree(emeta);
				goto fail_free_emeta;
			}

			emeta->nr_entries = lm->emeta_sec[0];
			l_mg->eline_meta[i] = emeta;
		} else {
			l_mg->emeta_alloc_type = PBLK_KMALLOC_META;

			emeta->buf = kmalloc(lm->emeta_len[0], GFP_KERNEL);
			if (!emeta->buf) {
				kfree(emeta);
				goto fail_free_emeta;
			}

			emeta->nr_entries = lm->emeta_sec[0];
			l_mg->eline_meta[i] = emeta;
		}
	}

	for (i = 0; i < l_mg->nr_lines; i++)
		l_mg->vsc_list[i] = cpu_to_le32(EMPTY_ENTRY);

	bb_distance = (geo->all_luns) * geo->ws_opt;
	for (i = 0; i < lm->sec_per_line; i += bb_distance)
		bitmap_set(l_mg->bb_template, i, geo->ws_opt);

	return 0;

fail_free_emeta:
	while (--i >= 0) {
		if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META)
			vfree(l_mg->eline_meta[i]->buf);
		else
			kfree(l_mg->eline_meta[i]->buf);
		kfree(l_mg->eline_meta[i]);
	}
fail_free_smeta:
	for (i = 0; i < PBLK_DATA_LINES; i++)
		kfree(l_mg->sline_meta[i]);
	kfree(l_mg->bb_aux);
fail_free_bb_template:
	kfree(l_mg->bb_template);
fail_free_vsc_list:
	kfree(l_mg->vsc_list);
fail:
	return -ENOMEM;
}
static int pblk_line_meta_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	unsigned int smeta_len, emeta_len;
	int i;

	lm->sec_per_line = geo->clba * geo->all_luns;
	lm->blk_per_line = geo->all_luns;
	lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
	lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
	lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
	lm->mid_thrs = lm->sec_per_line / 2;
	lm->high_thrs = lm->sec_per_line / 4;
	lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs;

	/* Calculate necessary pages for smeta. See comment over struct
	 * line_smeta definition
	 */
	i = 1;
add_smeta_page:
	lm->smeta_sec = i * geo->ws_opt;
	lm->smeta_len = lm->smeta_sec * geo->csecs;

	smeta_len = sizeof(struct line_smeta) + lm->lun_bitmap_len;
	if (smeta_len > lm->smeta_len) {
		i++;
		goto add_smeta_page;
	}

	/* Calculate necessary pages for emeta. See comment over struct
	 * line_emeta definition
	 */
	i = 1;
add_emeta_page:
	lm->emeta_sec[0] = i * geo->ws_opt;
	lm->emeta_len[0] = lm->emeta_sec[0] * geo->csecs;

	emeta_len = calc_emeta_len(pblk);
	if (emeta_len > lm->emeta_len[0]) {
		i++;
		goto add_emeta_page;
	}

	lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0;

	lm->min_blk_line = 1;
	if (geo->all_luns > 1)
		lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
					lm->emeta_sec[0], geo->clba);

	if (lm->min_blk_line > lm->blk_per_line) {
		pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n",
							lm->blk_per_line);
		return -EINVAL;
	}

	return 0;
}
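/*
 * min_blk_line example (hypothetical numbers): with smeta_sec = 8,
 * emeta_sec[0] = 64 and clba = 4096 sectors per chunk, line metadata fits
 * in DIV_ROUND_UP(8 + 64, 4096) = 1 extra chunk, so min_blk_line = 2: one
 * chunk of user data plus the chunk holding smeta/emeta.
 */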
static int pblk_lines_init(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line *line;
	void *chunk_meta;
	long nr_free_chks = 0;
	int i, ret;

	ret = pblk_line_meta_init(pblk);
	if (ret)
		return ret;

	ret = pblk_line_mg_init(pblk);
	if (ret)
		return ret;

	ret = pblk_luns_init(pblk);
	if (ret)
		goto fail_free_meta;

	chunk_meta = pblk_chunk_get_meta(pblk);
	if (IS_ERR(chunk_meta)) {
		ret = PTR_ERR(chunk_meta);
		goto fail_free_luns;
	}

	pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line),
								GFP_KERNEL);
	if (!pblk->lines) {
		ret = -ENOMEM;
		goto fail_free_chunk_meta;
	}

	for (i = 0; i < l_mg->nr_lines; i++) {
		line = &pblk->lines[i];

		ret = pblk_alloc_line_meta(pblk, line);
		if (ret)
			goto fail_free_lines;

		nr_free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i);
	}

	if (!nr_free_chks) {
		pblk_err(pblk, "too many bad blocks prevent for sane instance\n");
		ret = -EINTR;
		goto fail_free_lines;
	}

	pblk_set_provision(pblk, nr_free_chks);

	return 0;

fail_free_lines:
	while (--i >= 0)
		pblk_line_meta_free(l_mg, &pblk->lines[i]);
	kfree(pblk->lines);
fail_free_chunk_meta:
fail_free_luns:
	kfree(pblk->luns);
fail_free_meta:
	pblk_line_mg_free(pblk);

	return ret;
}
static int pblk_writer_init(struct pblk *pblk)
{
	pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
	if (IS_ERR(pblk->writer_ts)) {
		int err = PTR_ERR(pblk->writer_ts);

		if (err != -EINTR)
			pblk_err(pblk, "could not allocate writer kthread (%d)\n",
					err);
		return err;
	}

	timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));

	return 0;
}
static void pblk_writer_stop(struct pblk *pblk)
{
	/* The pipeline must be stopped and the write buffer emptied before the
	 * write thread is stopped
	 */
	WARN(pblk_rb_read_count(&pblk->rwb),
			"Stopping not fully persisted write buffer\n");
	WARN(pblk_rb_sync_count(&pblk->rwb),
			"Stopping not fully synced write buffer\n");

	del_timer_sync(&pblk->wtimer);
	if (pblk->writer_ts)
		kthread_stop(pblk->writer_ts);
}
static void pblk_free(struct pblk *pblk)
{
	pblk_lines_free(pblk);
	pblk_l2p_free(pblk);
	pblk_rwb_free(pblk);
	pblk_core_free(pblk);

	kfree(pblk);
}
static void pblk_tear_down(struct pblk *pblk, bool graceful)
{
	if (graceful)
		__pblk_pipeline_flush(pblk);
	__pblk_pipeline_stop(pblk);
	pblk_writer_stop(pblk);
	pblk_rb_sync_l2p(&pblk->rwb);
	pblk_rl_free(&pblk->rl);

	pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful);
}
static void pblk_exit(void *private, bool graceful)
{
	struct pblk *pblk = private;

	down_write(&pblk_lock);
	pblk_gc_exit(pblk, graceful);
	pblk_tear_down(pblk, graceful);

#ifdef CONFIG_NVM_PBLK_DEBUG
	pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
#endif

	pblk_free(pblk);
	up_write(&pblk_lock);
}
static sector_t pblk_capacity(void *private)
{
	struct pblk *pblk = private;

	return pblk->capacity * NR_PHY_IN_LOG;
}
static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
		       int flags)
{
	struct nvm_geo *geo = &dev->geo;
	struct request_queue *bqueue = dev->q;
	struct request_queue *tqueue = tdisk->queue;
	struct pblk *pblk;
	int ret;

	pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
	if (!pblk)
		return ERR_PTR(-ENOMEM);

	pblk->dev = dev;
	pblk->disk = tdisk;
	pblk->state = PBLK_STATE_RUNNING;
	pblk->gc.gc_enabled = 0;

	if (!(geo->version == NVM_OCSSD_SPEC_12 ||
					geo->version == NVM_OCSSD_SPEC_20)) {
		pblk_err(pblk, "OCSSD version not supported (%u)\n",
							geo->version);
		kfree(pblk);
		return ERR_PTR(-EINVAL);
	}

	if (geo->version == NVM_OCSSD_SPEC_12 && geo->dom & NVM_RSP_L2P) {
		pblk_err(pblk, "host-side L2P table not supported. (%x)\n",
							geo->dom);
		kfree(pblk);
		return ERR_PTR(-EINVAL);
	}

	spin_lock_init(&pblk->resubmit_lock);
	spin_lock_init(&pblk->trans_lock);
	spin_lock_init(&pblk->lock);

#ifdef CONFIG_NVM_PBLK_DEBUG
	atomic_long_set(&pblk->inflight_writes, 0);
	atomic_long_set(&pblk->padded_writes, 0);
	atomic_long_set(&pblk->padded_wb, 0);
	atomic_long_set(&pblk->req_writes, 0);
	atomic_long_set(&pblk->sub_writes, 0);
	atomic_long_set(&pblk->sync_writes, 0);
	atomic_long_set(&pblk->inflight_reads, 0);
	atomic_long_set(&pblk->cache_reads, 0);
	atomic_long_set(&pblk->sync_reads, 0);
	atomic_long_set(&pblk->recov_writes, 0);
	atomic_long_set(&pblk->recov_gc_writes, 0);
	atomic_long_set(&pblk->recov_gc_reads, 0);
#endif

	atomic_long_set(&pblk->read_failed, 0);
	atomic_long_set(&pblk->read_empty, 0);
	atomic_long_set(&pblk->read_high_ecc, 0);
	atomic_long_set(&pblk->read_failed_gc, 0);
	atomic_long_set(&pblk->write_failed, 0);
	atomic_long_set(&pblk->erase_failed, 0);

	ret = pblk_core_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize core\n");
		goto fail;
	}

	ret = pblk_lines_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize lines\n");
		goto fail_free_core;
	}

	ret = pblk_rwb_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize write buffer\n");
		goto fail_free_lines;
	}

	ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY);
	if (ret) {
		pblk_err(pblk, "could not initialize maps\n");
		goto fail_free_rwb;
	}

	ret = pblk_writer_init(pblk);
	if (ret) {
		if (ret != -EINTR)
			pblk_err(pblk, "could not initialize write thread\n");
		goto fail_free_l2p;
	}

	ret = pblk_gc_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize gc\n");
		goto fail_stop_writer;
	}

	/* inherit the size from the underlying device */
	blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
	blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));

	blk_queue_write_cache(tqueue, true, false);

	tqueue->limits.discard_granularity = geo->clba * geo->csecs;
	tqueue->limits.discard_alignment = 0;
	blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue);

	pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
			geo->all_luns, pblk->l_mg.nr_lines,
			(unsigned long long)pblk->rl.nr_secs,
			pblk->rwb.nr_entries);

	wake_up_process(pblk->writer_ts);

	/* Check if we need to start GC */
	pblk_gc_should_kick(pblk);

	return pblk;

fail_stop_writer:
	pblk_writer_stop(pblk);
fail_free_l2p:
	pblk_l2p_free(pblk);
fail_free_rwb:
	pblk_rwb_free(pblk);
fail_free_lines:
	pblk_lines_free(pblk);
fail_free_core:
	pblk_core_free(pblk);
fail:
	kfree(pblk);
	return ERR_PTR(ret);
}
/* physical block device target */
static struct nvm_tgt_type tt_pblk = {
	.name		= "pblk",
	.version	= {1, 0, 0},

	.make_rq	= pblk_make_rq,
	.capacity	= pblk_capacity,

	.init		= pblk_init,
	.exit		= pblk_exit,

	.sysfs_init	= pblk_sysfs_init,
	.sysfs_exit	= pblk_sysfs_exit,
	.owner		= THIS_MODULE,
};
static int __init pblk_module_init(void)
{
	int ret;

	ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
	if (ret)
		return ret;

	ret = nvm_register_tgt_type(&tt_pblk);
	if (ret)
		bioset_exit(&pblk_bio_set);

	return ret;
}
static void pblk_module_exit(void)
{
	bioset_exit(&pblk_bio_set);
	nvm_unregister_tgt_type(&tt_pblk);
}
module_init(pblk_module_init);
module_exit(pblk_module_exit);
MODULE_AUTHOR("Javier Gonzalez <javier@cnexlabs.com>");
MODULE_AUTHOR("Matias Bjorling <matias@cnexlabs.com>");
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs");