/*
 * Copyright (C) 2015 IT University of Copenhagen (rrpc.c)
 * Copyright (C) 2016 CNEX Labs
 * Initial release: Javier Gonzalez <javier@cnexlabs.com>
 *                  Matias Bjorling <matias@cnexlabs.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Implementation of a physical block-device target for Open-channel SSDs.
 *
 * pblk-init.c - pblk's initialization.
 */

#include "pblk.h"
static unsigned int write_buffer_size;
module_param(write_buffer_size, uint, 0644);
MODULE_PARM_DESC(write_buffer_size, "number of entries in a write buffer");
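/*
 * Usage note (illustrative, not from the original source): the write buffer
 * is normally sized from the device geometry in pblk_rwb_init() below, and
 * this module parameter is only honored when it asks for more entries than
 * that geometry-based minimum, e.g. (assuming pblk is built as a module):
 *
 *   modprobe pblk write_buffer_size=4096
 */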
static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache,
				*pblk_w_rq_cache;

static DECLARE_RWSEM(pblk_lock);
struct bio_set pblk_bio_set;
static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
			  struct bio *bio)
{
	int ret;

	/* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap
	 * constraint. Writes can be of arbitrary size.
	 */
	if (bio_data_dir(bio) == READ) {
		blk_queue_split(q, &bio);
		ret = pblk_submit_read(pblk, bio);
		if (ret == NVM_IO_DONE && bio_flagged(bio, BIO_CLONED))
			bio_put(bio);

		return ret;
	}

	/* Prevent deadlock in the case of a modest LUN configuration and large
	 * user I/Os. Unless stalled, the rate limiter leaves at least 256KB
	 * available for user I/O.
	 */
	if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl))
		blk_queue_split(q, &bio);

	return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
}
static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio)
{
	struct pblk *pblk = q->queuedata;

	if (bio_op(bio) == REQ_OP_DISCARD) {
		pblk_discard(pblk, bio);
		if (!(bio->bi_opf & REQ_PREFLUSH)) {
			bio_endio(bio);
			return BLK_QC_T_NONE;
		}
	}

	switch (pblk_rw_io(q, pblk, bio)) {
	case NVM_IO_ERR:
		bio_io_error(bio);
		break;
	case NVM_IO_DONE:
		bio_endio(bio);
		break;
	}

	return BLK_QC_T_NONE;
}
static size_t pblk_trans_map_size(struct pblk *pblk)
{
	int entry_size = 8;

	if (pblk->addrf_len < 32)
		entry_size = 4;

	return entry_size * pblk->rl.nr_secs;
}
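/*
 * Sizing note (illustrative numbers, not from the original source): with an
 * address format that fits in 32 bits, each L2P entry takes 4 bytes, so a
 * device exposing ~26M sectors (100GB of 4KB sectors) needs roughly 100MB of
 * memory for the table; formats wider than 32 bits double that to 8 bytes
 * per entry.
 */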
#ifdef CONFIG_NVM_PBLK_DEBUG
static u32 pblk_l2p_crc(struct pblk *pblk)
{
	size_t map_size;
	u32 crc = ~(u32)0;

	map_size = pblk_trans_map_size(pblk);
	crc = crc32_le(crc, pblk->trans_map, map_size);

	return crc;
}
#endif
static void pblk_l2p_free(struct pblk *pblk)
{
	vfree(pblk->trans_map);
}
static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
{
	struct pblk_line *line = NULL;

	if (factory_init) {
		pblk_setup_uuid(pblk);
	} else {
		line = pblk_recov_l2p(pblk);
		if (IS_ERR(line)) {
			pblk_err(pblk, "could not recover l2p table\n");
			return -EFAULT;
		}
	}

#ifdef CONFIG_NVM_PBLK_DEBUG
	pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
#endif

	/* Free full lines directly as GC has not been started yet */
	pblk_gc_free_full_lines(pblk);

	if (!factory_init) {
		/* Configure next line for user data */
		line = pblk_line_get_first_data(pblk);
		if (!line)
			return -EFAULT;
	}

	return 0;
}
static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
{
	sector_t i;
	struct ppa_addr ppa;
	size_t map_size;
	int ret = 0;

	map_size = pblk_trans_map_size(pblk);
	pblk->trans_map = vmalloc(map_size);
	if (!pblk->trans_map)
		return -ENOMEM;

	pblk_ppa_set_empty(&ppa);

	for (i = 0; i < pblk->rl.nr_secs; i++)
		pblk_trans_map_set(pblk, i, ppa);

	ret = pblk_l2p_recover(pblk, factory_init);
	if (ret)
		vfree(pblk->trans_map);

	return ret;
}
static void pblk_rwb_free(struct pblk *pblk)
{
	if (pblk_rb_tear_down_check(&pblk->rwb))
		pblk_err(pblk, "write buffer error on tear down\n");

	pblk_rb_data_free(&pblk->rwb);
	vfree(pblk_rb_entries_ref(&pblk->rwb));
}
static int pblk_rwb_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_rb_entry *entries;
	unsigned long nr_entries, buffer_size;
	unsigned int power_size, power_seg_sz;
	int pgs_in_buffer;

	pgs_in_buffer = (max(geo->mw_cunits, geo->ws_opt) + geo->ws_opt)
								* geo->all_luns;

	if (write_buffer_size && (write_buffer_size > pgs_in_buffer))
		buffer_size = write_buffer_size;
	else
		buffer_size = pgs_in_buffer;

	nr_entries = pblk_rb_calculate_size(buffer_size);

	entries = vzalloc(array_size(nr_entries, sizeof(struct pblk_rb_entry)));
	if (!entries)
		return -ENOMEM;

	power_size = get_count_order(nr_entries);
	power_seg_sz = get_count_order(geo->csecs);

	return pblk_rb_init(&pblk->rwb, entries, power_size, power_seg_sz);
}
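/*
 * Sizing example (assumed geometry, for illustration only): with
 * mw_cunits = 8, ws_opt = 8 and 64 LUNs, pgs_in_buffer above is
 * (max(8, 8) + 8) * 64 = 1024 cached sectors, i.e. 4MB of write buffer with
 * 4KB sectors, unless a larger write_buffer_size module parameter overrides
 * it.
 */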
/* Minimum pages needed within a lun */
#define ADDR_POOL_SIZE 64
static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo,
			     struct nvm_addrf_12 *dst)
{
	struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf;
	int power_len;

	/* Re-calculate channel and lun format to adapt to configuration */
	power_len = get_count_order(geo->num_ch);
	if (1 << power_len != geo->num_ch) {
		pblk_err(pblk, "supports only power-of-two channel config.\n");
		return -EINVAL;
	}
	dst->ch_len = power_len;

	power_len = get_count_order(geo->num_lun);
	if (1 << power_len != geo->num_lun) {
		pblk_err(pblk, "supports only power-of-two LUN config.\n");
		return -EINVAL;
	}
	dst->lun_len = power_len;

	dst->blk_len = src->blk_len;
	dst->pg_len = src->pg_len;
	dst->pln_len = src->pln_len;
	dst->sec_len = src->sec_len;

	dst->sec_offset = 0;
	dst->pln_offset = dst->sec_len;
	dst->ch_offset = dst->pln_offset + dst->pln_len;
	dst->lun_offset = dst->ch_offset + dst->ch_len;
	dst->pg_offset = dst->lun_offset + dst->lun_len;
	dst->blk_offset = dst->pg_offset + dst->pg_len;

	dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset;
	dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset;
	dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset;
	dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset;
	dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset;
	dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset;

	return dst->blk_offset + src->blk_len;
}
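/*
 * Worked example (assumed geometry, for illustration only): with num_ch = 16
 * and num_lun = 8, ch_len = 4 and lun_len = 3; if the source format has
 * sec_len = 2, pln_len = 1, pg_len = 9 and blk_len = 11, the offsets become
 * sec = 0, pln = 2, ch = 3, lun = 7, pg = 10 and blk = 19, each mask selects
 * its field (e.g. lun_mask = ((1ULL << 3) - 1) << 7 = 0x380), and the
 * function reports an address length of 19 + 11 = 30 bits.
 */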
static int pblk_set_addrf_20(struct nvm_geo *geo, struct nvm_addrf *adst,
			     struct pblk_addrf *udst)
{
	struct nvm_addrf *src = &geo->addrf;

	adst->ch_len = get_count_order(geo->num_ch);
	adst->lun_len = get_count_order(geo->num_lun);
	adst->chk_len = src->chk_len;
	adst->sec_len = src->sec_len;

	adst->sec_offset = 0;
	adst->ch_offset = adst->sec_len;
	adst->lun_offset = adst->ch_offset + adst->ch_len;
	adst->chk_offset = adst->lun_offset + adst->lun_len;

	adst->sec_mask = ((1ULL << adst->sec_len) - 1) << adst->sec_offset;
	adst->chk_mask = ((1ULL << adst->chk_len) - 1) << adst->chk_offset;
	adst->lun_mask = ((1ULL << adst->lun_len) - 1) << adst->lun_offset;
	adst->ch_mask = ((1ULL << adst->ch_len) - 1) << adst->ch_offset;

	udst->sec_stripe = geo->ws_opt;
	udst->ch_stripe = geo->num_ch;
	udst->lun_stripe = geo->num_lun;

	udst->sec_lun_stripe = udst->sec_stripe * udst->ch_stripe;
	udst->sec_ws_stripe = udst->sec_lun_stripe * udst->lun_stripe;

	return adst->chk_offset + adst->chk_len;
}
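/*
 * Striping note (assumed values, for illustration only): with ws_opt = 8,
 * num_ch = 16 and num_lun = 8, data is laid out in groups of sec_stripe = 8
 * sectors per parallel unit, a rotation over all channels covers
 * sec_lun_stripe = 8 * 16 = 128 sectors, and a full rotation over every LUN
 * covers sec_ws_stripe = 128 * 8 = 1024 sectors.
 */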
static int pblk_set_addrf(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	int mod;

	switch (geo->version) {
	case NVM_OCSSD_SPEC_12:
		div_u64_rem(geo->clba, pblk->min_write_pgs, &mod);
		if (mod) {
			pblk_err(pblk, "bad configuration of sectors/pages\n");
			return -EINVAL;
		}

		pblk->addrf_len = pblk_set_addrf_12(pblk, geo,
							(void *)&pblk->addrf);
		break;
	case NVM_OCSSD_SPEC_20:
		pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf,
							&pblk->uaddrf);
		break;
	default:
		pblk_err(pblk, "OCSSD revision not supported (%d)\n",
							geo->version);
		return -EINVAL;
	}

	return 0;
}
static int pblk_init_global_caches(struct pblk *pblk)
{
	down_write(&pblk_lock);

	pblk_ws_cache = kmem_cache_create("pblk_blk_ws",
				sizeof(struct pblk_line_ws), 0, 0, NULL);
	if (!pblk_ws_cache) {
		up_write(&pblk_lock);
		return -ENOMEM;
	}

	pblk_rec_cache = kmem_cache_create("pblk_rec",
				sizeof(struct pblk_rec_ctx), 0, 0, NULL);
	if (!pblk_rec_cache) {
		kmem_cache_destroy(pblk_ws_cache);
		up_write(&pblk_lock);
		return -ENOMEM;
	}

	pblk_g_rq_cache = kmem_cache_create("pblk_g_rq", pblk_g_rq_size,
				0, 0, NULL);
	if (!pblk_g_rq_cache) {
		kmem_cache_destroy(pblk_ws_cache);
		kmem_cache_destroy(pblk_rec_cache);
		up_write(&pblk_lock);
		return -ENOMEM;
	}

	pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size,
				0, 0, NULL);
	if (!pblk_w_rq_cache) {
		kmem_cache_destroy(pblk_ws_cache);
		kmem_cache_destroy(pblk_rec_cache);
		kmem_cache_destroy(pblk_g_rq_cache);
		up_write(&pblk_lock);
		return -ENOMEM;
	}

	up_write(&pblk_lock);

	return 0;
}
static void pblk_free_global_caches(struct pblk *pblk)
{
	kmem_cache_destroy(pblk_ws_cache);
	kmem_cache_destroy(pblk_rec_cache);
	kmem_cache_destroy(pblk_g_rq_cache);
	kmem_cache_destroy(pblk_w_rq_cache);
}
static int pblk_core_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	int ret, max_write_ppas;

	atomic64_set(&pblk->user_wa, 0);
	atomic64_set(&pblk->pad_wa, 0);
	atomic64_set(&pblk->gc_wa, 0);
	pblk->user_rst_wa = 0;
	pblk->pad_rst_wa = 0;

	atomic64_set(&pblk->nr_flush, 0);
	pblk->nr_flush_rst = 0;

	pblk->min_write_pgs = geo->ws_opt;
	max_write_ppas = pblk->min_write_pgs * geo->all_luns;
	pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
	pblk->max_write_pgs = min_t(int, pblk->max_write_pgs,
		queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT));
	pblk_set_sec_per_write(pblk, pblk->min_write_pgs);

	if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
		pblk_err(pblk, "vector list too big(%u > %u)\n",
				pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS);
		return -EINVAL;
	}

	pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t),
								GFP_KERNEL);
	if (!pblk->pad_dist)
		return -ENOMEM;

	if (pblk_init_global_caches(pblk))
		goto fail_free_pad_dist;

	/* Internal bios can be at most the sectors signaled by the device. */
	ret = mempool_init_page_pool(&pblk->page_bio_pool, NVM_MAX_VLBA, 0);
	if (ret)
		goto free_global_caches;

	ret = mempool_init_slab_pool(&pblk->gen_ws_pool, PBLK_GEN_WS_POOL_SIZE,
				     pblk_ws_cache);
	if (ret)
		goto free_page_bio_pool;

	ret = mempool_init_slab_pool(&pblk->rec_pool, geo->all_luns,
				     pblk_rec_cache);
	if (ret)
		goto free_gen_ws_pool;

	ret = mempool_init_slab_pool(&pblk->r_rq_pool, geo->all_luns,
				     pblk_g_rq_cache);
	if (ret)
		goto free_rec_pool;

	ret = mempool_init_slab_pool(&pblk->e_rq_pool, geo->all_luns,
				     pblk_g_rq_cache);
	if (ret)
		goto free_r_rq_pool;

	ret = mempool_init_slab_pool(&pblk->w_rq_pool, geo->all_luns,
				     pblk_w_rq_cache);
	if (ret)
		goto free_e_rq_pool;

	pblk->close_wq = alloc_workqueue("pblk-close-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS);
	if (!pblk->close_wq)
		goto free_w_rq_pool;

	pblk->bb_wq = alloc_workqueue("pblk-bb-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!pblk->bb_wq)
		goto free_close_wq;

	pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!pblk->r_end_wq)
		goto free_bb_wq;

	if (pblk_set_addrf(pblk))
		goto free_r_end_wq;

	INIT_LIST_HEAD(&pblk->compl_list);
	INIT_LIST_HEAD(&pblk->resubmit_list);

	return 0;

free_r_end_wq:
	destroy_workqueue(pblk->r_end_wq);
free_bb_wq:
	destroy_workqueue(pblk->bb_wq);
free_close_wq:
	destroy_workqueue(pblk->close_wq);
free_w_rq_pool:
	mempool_exit(&pblk->w_rq_pool);
free_e_rq_pool:
	mempool_exit(&pblk->e_rq_pool);
free_r_rq_pool:
	mempool_exit(&pblk->r_rq_pool);
free_rec_pool:
	mempool_exit(&pblk->rec_pool);
free_gen_ws_pool:
	mempool_exit(&pblk->gen_ws_pool);
free_page_bio_pool:
	mempool_exit(&pblk->page_bio_pool);
free_global_caches:
	pblk_free_global_caches(pblk);
fail_free_pad_dist:
	kfree(pblk->pad_dist);
	return -ENOMEM;
}
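/*
 * Sizing example (assumed numbers, for illustration only): with ws_opt = 8,
 * 64 LUNs, 4KB sectors, a 512KB queue_max_hw_sectors limit and NVM_MAX_VLBA
 * assumed to be 64, min_write_pgs is 8 and max_write_pgs ends up as
 * min(8 * 64, 64, 512KB / 4KB) = 64 sectors per write command.
 */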
static void pblk_core_free(struct pblk *pblk)
{
	if (pblk->close_wq)
		destroy_workqueue(pblk->close_wq);

	if (pblk->r_end_wq)
		destroy_workqueue(pblk->r_end_wq);

	if (pblk->bb_wq)
		destroy_workqueue(pblk->bb_wq);

	mempool_exit(&pblk->page_bio_pool);
	mempool_exit(&pblk->gen_ws_pool);
	mempool_exit(&pblk->rec_pool);
	mempool_exit(&pblk->r_rq_pool);
	mempool_exit(&pblk->e_rq_pool);
	mempool_exit(&pblk->w_rq_pool);

	pblk_free_global_caches(pblk);
	kfree(pblk->pad_dist);
}
static void pblk_line_mg_free(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	int i;

	kfree(l_mg->bb_template);
	kfree(l_mg->vsc_list);

	for (i = 0; i < PBLK_DATA_LINES; i++) {
		kfree(l_mg->sline_meta[i]);
		pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type);
		kfree(l_mg->eline_meta[i]);
	}
}
static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg,
				struct pblk_line *line)
{
	struct pblk_w_err_gc *w_err_gc = line->w_err_gc;

	kfree(line->blk_bitmap);
	kfree(line->erase_bitmap);

	pblk_mfree(w_err_gc->lba_list, l_mg->emeta_alloc_type);
}
static void pblk_lines_free(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line *line;
	int i;

	spin_lock(&l_mg->free_lock);
	for (i = 0; i < l_mg->nr_lines; i++) {
		line = &pblk->lines[i];

		pblk_line_free(line);
		pblk_line_meta_free(l_mg, line);
	}
	spin_unlock(&l_mg->free_lock);

	pblk_line_mg_free(pblk);
}
static int pblk_bb_get_tbl(struct nvm_tgt_dev *dev, struct pblk_lun *rlun,
			   u8 *blks, int nr_blks)
{
	struct ppa_addr ppa;
	int ret;

	ppa.ppa = 0;
	ppa.g.ch = rlun->bppa.g.ch;
	ppa.g.lun = rlun->bppa.g.lun;

	ret = nvm_get_tgt_bb_tbl(dev, ppa, blks);
	if (ret)
		return ret;

	nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks);
	if (nr_blks < 0)
		return -EIO;

	return 0;
}
static void *pblk_bb_get_meta(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	u8 *meta;
	int i, nr_blks, blk_per_lun;
	int ret;

	blk_per_lun = geo->num_chk * geo->pln_mode;
	nr_blks = blk_per_lun * geo->all_luns;

	meta = kmalloc(nr_blks, GFP_KERNEL);
	if (!meta)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < geo->all_luns; i++) {
		struct pblk_lun *rlun = &pblk->luns[i];
		u8 *meta_pos = meta + i * blk_per_lun;

		ret = pblk_bb_get_tbl(dev, rlun, meta_pos, blk_per_lun);
		if (ret) {
			kfree(meta);
			return ERR_PTR(-EIO);
		}
	}

	return meta;
}
static void *pblk_chunk_get_meta(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;

	if (geo->version == NVM_OCSSD_SPEC_12)
		return pblk_bb_get_meta(pblk);
	else
		return pblk_chunk_get_info(pblk);
}
static int pblk_luns_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_lun *rlun;
	int i;

	/* TODO: Implement unbalanced LUN support */
	if (geo->num_lun < 0) {
		pblk_err(pblk, "unbalanced LUN config.\n");
		return -EINVAL;
	}

	pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun),
								GFP_KERNEL);
	if (!pblk->luns)
		return -ENOMEM;

	for (i = 0; i < geo->all_luns; i++) {
		/* Stripe across channels */
		int ch = i % geo->num_ch;
		int lun_raw = i / geo->num_ch;
		int lunid = lun_raw + ch * geo->num_lun;

		rlun = &pblk->luns[i];
		rlun->bppa = dev->luns[lunid];

		sema_init(&rlun->wr_sem, 1);
	}

	return 0;
}
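/*
 * Mapping example (assumed geometry, for illustration only): with num_ch = 2
 * and num_lun = 4 per channel, the loop above visits device LUNs in the
 * order 0, 4, 1, 5, 2, 6, 3, 7 (lunid = lun_raw + ch * num_lun), so
 * consecutive pblk LUN slots alternate between channels rather than filling
 * one channel first.
 */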
/* See comment over struct line_emeta definition */
static unsigned int calc_emeta_len(struct pblk *pblk)
{
	struct pblk_line_meta *lm = &pblk->lm;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;

	/* Round to sector size so that lba_list starts on its own sector */
	lm->emeta_sec[1] = DIV_ROUND_UP(sizeof(struct line_emeta) +
			lm->blk_bitmap_len + sizeof(struct wa_counters),
			geo->csecs);
	lm->emeta_len[1] = lm->emeta_sec[1] * geo->csecs;

	/* Round to sector size so that vsc_list starts on its own sector */
	lm->dsec_per_line = lm->sec_per_line - lm->emeta_sec[0];
	lm->emeta_sec[2] = DIV_ROUND_UP(lm->dsec_per_line * sizeof(u64),
			geo->csecs);
	lm->emeta_len[2] = lm->emeta_sec[2] * geo->csecs;

	lm->emeta_sec[3] = DIV_ROUND_UP(l_mg->nr_lines * sizeof(u32),
			geo->csecs);
	lm->emeta_len[3] = lm->emeta_sec[3] * geo->csecs;

	lm->vsc_list_len = l_mg->nr_lines * sizeof(u32);

	return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]);
}
static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	struct nvm_geo *geo = &dev->geo;
	sector_t provisioned;
	int sec_meta, blk_meta;

	if (geo->op == NVM_TARGET_DEFAULT_OP)
		pblk->op = PBLK_DEFAULT_OP;
	else
		pblk->op = geo->op;

	provisioned = nr_free_blks;
	provisioned *= (100 - pblk->op);
	sector_div(provisioned, 100);

	pblk->op_blks = nr_free_blks - provisioned;

	/* Internally pblk manages all free blocks, but all calculations based
	 * on user capacity consider only provisioned blocks
	 */
	pblk->rl.total_blocks = nr_free_blks;
	pblk->rl.nr_secs = nr_free_blks * geo->clba;

	/* Consider sectors used for metadata */
	sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
	blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);

	pblk->capacity = (provisioned - blk_meta) * geo->clba;

	atomic_set(&pblk->rl.free_blocks, nr_free_blks);
	atomic_set(&pblk->rl.free_user_blocks, nr_free_blks);
}
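/*
 * Worked example (illustrative numbers): with nr_free_blks = 1000 chunks and
 * the default over-provisioning (assuming PBLK_DEFAULT_OP is 11), provisioned
 * becomes 1000 * (100 - 11) / 100 = 890 chunks and op_blks = 110; user
 * capacity is then (890 - blk_meta) * clba sectors, where blk_meta covers the
 * smeta/emeta sectors reserved in each free line.
 */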
static int pblk_setup_line_meta_12(struct pblk *pblk, struct pblk_line *line,
				   void *chunk_meta)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	int i, chk_per_lun, nr_bad_chks = 0;

	chk_per_lun = geo->num_chk * geo->pln_mode;

	for (i = 0; i < lm->blk_per_line; i++) {
		struct pblk_lun *rlun = &pblk->luns[i];
		struct nvm_chk_meta *chunk;
		int pos = pblk_ppa_to_pos(geo, rlun->bppa);
		u8 *lun_bb_meta = chunk_meta + pos * chk_per_lun;

		chunk = &line->chks[pos];

		/*
		 * In 1.2 spec. chunk state is not persisted by the device. Thus
		 * some of the values are reset each time pblk is instantiated,
		 * so we have to assume that the block is closed.
		 */
		if (lun_bb_meta[line->id] == NVM_BLK_T_FREE)
			chunk->state = NVM_CHK_ST_CLOSED;
		else
			chunk->state = NVM_CHK_ST_OFFLINE;

		chunk->type = NVM_CHK_TP_W_SEQ;
		chunk->cnlb = geo->clba;

		if (!(chunk->state & NVM_CHK_ST_OFFLINE))
			continue;

		set_bit(pos, line->blk_bitmap);
		nr_bad_chks++;
	}

	return nr_bad_chks;
}
static int pblk_setup_line_meta_20(struct pblk *pblk, struct pblk_line *line,
				   struct nvm_chk_meta *meta)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	int i, nr_bad_chks = 0;

	for (i = 0; i < lm->blk_per_line; i++) {
		struct pblk_lun *rlun = &pblk->luns[i];
		struct nvm_chk_meta *chunk;
		struct nvm_chk_meta *chunk_meta;
		struct ppa_addr ppa;
		int pos;

		ppa = rlun->bppa;
		pos = pblk_ppa_to_pos(geo, ppa);
		chunk = &line->chks[pos];

		ppa.m.chk = line->id;
		chunk_meta = pblk_chunk_get_off(pblk, meta, ppa);

		chunk->state = chunk_meta->state;
		chunk->type = chunk_meta->type;
		chunk->wi = chunk_meta->wi;
		chunk->slba = chunk_meta->slba;
		chunk->cnlb = chunk_meta->cnlb;
		chunk->wp = chunk_meta->wp;

		if (chunk->type & NVM_CHK_TP_SZ_SPEC) {
			WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n");
			continue;
		}

		if (!(chunk->state & NVM_CHK_ST_OFFLINE))
			continue;

		set_bit(pos, line->blk_bitmap);
		nr_bad_chks++;
	}

	return nr_bad_chks;
}
static long pblk_setup_line_meta(struct pblk *pblk, struct pblk_line *line,
				 void *chunk_meta, int line_id)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	long nr_bad_chks, chk_in_line;

	line->type = PBLK_LINETYPE_FREE;
	line->state = PBLK_LINESTATE_NEW;
	line->gc_group = PBLK_LINEGC_NONE;
	line->vsc = &l_mg->vsc_list[line_id];
	spin_lock_init(&line->lock);

	if (geo->version == NVM_OCSSD_SPEC_12)
		nr_bad_chks = pblk_setup_line_meta_12(pblk, line, chunk_meta);
	else
		nr_bad_chks = pblk_setup_line_meta_20(pblk, line, chunk_meta);

	chk_in_line = lm->blk_per_line - nr_bad_chks;
	if (nr_bad_chks < 0 || nr_bad_chks > lm->blk_per_line ||
					chk_in_line < lm->min_blk_line) {
		line->state = PBLK_LINESTATE_BAD;
		list_add_tail(&line->list, &l_mg->bad_list);
		return 0;
	}

	atomic_set(&line->blk_in_line, chk_in_line);
	list_add_tail(&line->list, &l_mg->free_list);
	l_mg->nr_free_lines++;

	return chk_in_line;
}
static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line)
{
	struct pblk_line_meta *lm = &pblk->lm;

	line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
	if (!line->blk_bitmap)
		return -ENOMEM;

	line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
	if (!line->erase_bitmap)
		goto free_blk_bitmap;

	line->chks = kmalloc_array(lm->blk_per_line,
				   sizeof(struct nvm_chk_meta), GFP_KERNEL);
	if (!line->chks)
		goto free_erase_bitmap;

	line->w_err_gc = kzalloc(sizeof(struct pblk_w_err_gc), GFP_KERNEL);
	if (!line->w_err_gc)
		goto free_chks;

	return 0;

free_chks:
	kfree(line->chks);
free_erase_bitmap:
	kfree(line->erase_bitmap);
free_blk_bitmap:
	kfree(line->blk_bitmap);

	return -ENOMEM;
}
static int pblk_line_mg_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	int i, bb_distance;

	l_mg->nr_lines = geo->num_chk;
	l_mg->log_line = l_mg->data_line = NULL;
	l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
	l_mg->nr_free_lines = 0;
	bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);

	INIT_LIST_HEAD(&l_mg->free_list);
	INIT_LIST_HEAD(&l_mg->corrupt_list);
	INIT_LIST_HEAD(&l_mg->bad_list);
	INIT_LIST_HEAD(&l_mg->gc_full_list);
	INIT_LIST_HEAD(&l_mg->gc_high_list);
	INIT_LIST_HEAD(&l_mg->gc_mid_list);
	INIT_LIST_HEAD(&l_mg->gc_low_list);
	INIT_LIST_HEAD(&l_mg->gc_empty_list);
	INIT_LIST_HEAD(&l_mg->gc_werr_list);

	INIT_LIST_HEAD(&l_mg->emeta_list);

	l_mg->gc_lists[0] = &l_mg->gc_werr_list;
	l_mg->gc_lists[1] = &l_mg->gc_high_list;
	l_mg->gc_lists[2] = &l_mg->gc_mid_list;
	l_mg->gc_lists[3] = &l_mg->gc_low_list;

	spin_lock_init(&l_mg->free_lock);
	spin_lock_init(&l_mg->close_lock);
	spin_lock_init(&l_mg->gc_lock);

	l_mg->vsc_list = kcalloc(l_mg->nr_lines, sizeof(__le32), GFP_KERNEL);
	if (!l_mg->vsc_list)
		goto fail;

	l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
	if (!l_mg->bb_template)
		goto fail_free_vsc_list;

	l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
	if (!l_mg->bb_aux)
		goto fail_free_bb_template;

	/* smeta is always small enough to fit on a kmalloc memory allocation,
	 * emeta depends on the number of LUNs allocated to the pblk instance
	 */
	for (i = 0; i < PBLK_DATA_LINES; i++) {
		l_mg->sline_meta[i] = kmalloc(lm->smeta_len, GFP_KERNEL);
		if (!l_mg->sline_meta[i])
			goto fail_free_smeta;
	}

	/* emeta allocates three different buffers for managing metadata with
	 * in-memory and in-media layouts
	 */
	for (i = 0; i < PBLK_DATA_LINES; i++) {
		struct pblk_emeta *emeta;

		emeta = kmalloc(sizeof(struct pblk_emeta), GFP_KERNEL);
		if (!emeta)
			goto fail_free_emeta;

		if (lm->emeta_len[0] > KMALLOC_MAX_CACHE_SIZE) {
			l_mg->emeta_alloc_type = PBLK_VMALLOC_META;

			emeta->buf = vmalloc(lm->emeta_len[0]);
			if (!emeta->buf) {
				kfree(emeta);
				goto fail_free_emeta;
			}

			emeta->nr_entries = lm->emeta_sec[0];
			l_mg->eline_meta[i] = emeta;
		} else {
			l_mg->emeta_alloc_type = PBLK_KMALLOC_META;

			emeta->buf = kmalloc(lm->emeta_len[0], GFP_KERNEL);
			if (!emeta->buf) {
				kfree(emeta);
				goto fail_free_emeta;
			}

			emeta->nr_entries = lm->emeta_sec[0];
			l_mg->eline_meta[i] = emeta;
		}
	}

	for (i = 0; i < l_mg->nr_lines; i++)
		l_mg->vsc_list[i] = cpu_to_le32(EMPTY_ENTRY);

	bb_distance = (geo->all_luns) * geo->ws_opt;
	for (i = 0; i < lm->sec_per_line; i += bb_distance)
		bitmap_set(l_mg->bb_template, i, geo->ws_opt);

	return 0;

fail_free_emeta:
	while (--i >= 0) {
		if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META)
			vfree(l_mg->eline_meta[i]->buf);
		else
			kfree(l_mg->eline_meta[i]->buf);
		kfree(l_mg->eline_meta[i]);
	}
fail_free_smeta:
	for (i = 0; i < PBLK_DATA_LINES; i++)
		kfree(l_mg->sline_meta[i]);
fail_free_bb_template:
	kfree(l_mg->bb_template);
fail_free_vsc_list:
	kfree(l_mg->vsc_list);
fail:
	return -ENOMEM;
}
static int pblk_line_meta_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	unsigned int smeta_len, emeta_len;
	int i;

	lm->sec_per_line = geo->clba * geo->all_luns;
	lm->blk_per_line = geo->all_luns;
	lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
	lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
	lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
	lm->mid_thrs = lm->sec_per_line / 2;
	lm->high_thrs = lm->sec_per_line / 4;
	lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs;

	/* Calculate necessary pages for smeta. See comment over struct
	 * line_smeta definition
	 */
	i = 1;
add_smeta_page:
	lm->smeta_sec = i * geo->ws_opt;
	lm->smeta_len = lm->smeta_sec * geo->csecs;

	smeta_len = sizeof(struct line_smeta) + lm->lun_bitmap_len;
	if (smeta_len > lm->smeta_len) {
		i++;
		goto add_smeta_page;
	}

	/* Calculate necessary pages for emeta. See comment over struct
	 * line_emeta definition
	 */
	i = 1;
add_emeta_page:
	lm->emeta_sec[0] = i * geo->ws_opt;
	lm->emeta_len[0] = lm->emeta_sec[0] * geo->csecs;

	emeta_len = calc_emeta_len(pblk);
	if (emeta_len > lm->emeta_len[0]) {
		i++;
		goto add_emeta_page;
	}

	lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0;

	lm->min_blk_line = 1;
	if (geo->all_luns > 1)
		lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
					lm->emeta_sec[0], geo->clba);

	if (lm->min_blk_line > lm->blk_per_line) {
		pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n",
							lm->blk_per_line);
		return -EINVAL;
	}

	return 0;
}
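/*
 * Sizing note (illustrative numbers, not from the original source): with
 * ws_opt = 8 and 4KB sectors, each pass of the loops above grows smeta/emeta
 * by one write unit of 8 sectors (32KB); a small line_smeta plus LUN bitmap
 * typically fits on the first pass, so lm->smeta_sec ends up as 8, while
 * emeta keeps growing until calc_emeta_len() (lba_list plus vsc_list) fits.
 */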
static int pblk_lines_init(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line *line;
	void *chunk_meta;
	long nr_free_chks = 0;
	int i, ret;

	ret = pblk_line_meta_init(pblk);
	if (ret)
		return ret;

	ret = pblk_line_mg_init(pblk);
	if (ret)
		return ret;

	ret = pblk_luns_init(pblk);
	if (ret)
		goto fail_free_meta;

	chunk_meta = pblk_chunk_get_meta(pblk);
	if (IS_ERR(chunk_meta)) {
		ret = PTR_ERR(chunk_meta);
		goto fail_free_luns;
	}

	pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line),
								GFP_KERNEL);
	if (!pblk->lines) {
		ret = -ENOMEM;
		goto fail_free_chunk_meta;
	}

	for (i = 0; i < l_mg->nr_lines; i++) {
		line = &pblk->lines[i];

		ret = pblk_alloc_line_meta(pblk, line);
		if (ret)
			goto fail_free_lines;

		nr_free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i);
	}

	if (!nr_free_chks) {
		pblk_err(pblk, "too many bad blocks prevent for sane instance\n");
		ret = -EINTR;
		goto fail_free_lines;
	}

	pblk_set_provision(pblk, nr_free_chks);

	return 0;

fail_free_lines:
	while (--i >= 0)
		pblk_line_meta_free(l_mg, &pblk->lines[i]);
	kfree(pblk->lines);
fail_free_chunk_meta:
fail_free_luns:
	kfree(pblk->luns);
fail_free_meta:
	pblk_line_mg_free(pblk);

	return ret;
}
static int pblk_writer_init(struct pblk *pblk)
{
	pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
	if (IS_ERR(pblk->writer_ts)) {
		int err = PTR_ERR(pblk->writer_ts);

		pblk_err(pblk, "could not allocate writer kthread (%d)\n",
				err);
		return err;
	}

	timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));

	return 0;
}
static void pblk_writer_stop(struct pblk *pblk)
{
	/* The pipeline must be stopped and the write buffer emptied before the
	 * write thread is stopped
	 */
	WARN(pblk_rb_read_count(&pblk->rwb),
			"Stopping not fully persisted write buffer\n");
	WARN(pblk_rb_sync_count(&pblk->rwb),
			"Stopping not fully synced write buffer\n");

	del_timer_sync(&pblk->wtimer);
	if (pblk->writer_ts)
		kthread_stop(pblk->writer_ts);
}
static void pblk_free(struct pblk *pblk)
{
	pblk_lines_free(pblk);
	pblk_l2p_free(pblk);
	pblk_rwb_free(pblk);
	pblk_core_free(pblk);
}
static void pblk_tear_down(struct pblk *pblk, bool graceful)
{
	if (graceful)
		__pblk_pipeline_flush(pblk);
	__pblk_pipeline_stop(pblk);
	pblk_writer_stop(pblk);
	pblk_rb_sync_l2p(&pblk->rwb);
	pblk_rl_free(&pblk->rl);

	pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful);
}
static void pblk_exit(void *private, bool graceful)
{
	struct pblk *pblk = private;

	down_write(&pblk_lock);
	pblk_gc_exit(pblk, graceful);
	pblk_tear_down(pblk, graceful);

#ifdef CONFIG_NVM_PBLK_DEBUG
	pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
#endif

	pblk_free(pblk);
	up_write(&pblk_lock);
}
static sector_t pblk_capacity(void *private)
{
	struct pblk *pblk = private;

	return pblk->capacity * NR_PHY_IN_LOG;
}
static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
		       int flags)
{
	struct nvm_geo *geo = &dev->geo;
	struct request_queue *bqueue = dev->q;
	struct request_queue *tqueue = tdisk->queue;
	struct pblk *pblk;
	int ret;

	pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
	if (!pblk)
		return ERR_PTR(-ENOMEM);

	pblk->dev = dev;
	pblk->disk = tdisk;
	pblk->state = PBLK_STATE_RUNNING;
	pblk->gc.gc_enabled = 0;

	if (!(geo->version == NVM_OCSSD_SPEC_12 ||
					geo->version == NVM_OCSSD_SPEC_20)) {
		pblk_err(pblk, "OCSSD version not supported (%u)\n",
							geo->version);
		kfree(pblk);
		return ERR_PTR(-EINVAL);
	}

	if (geo->version == NVM_OCSSD_SPEC_12 && geo->dom & NVM_RSP_L2P) {
		pblk_err(pblk, "host-side L2P table not supported. (%x)\n",
							geo->dom);
		kfree(pblk);
		return ERR_PTR(-EINVAL);
	}

	spin_lock_init(&pblk->resubmit_lock);
	spin_lock_init(&pblk->trans_lock);
	spin_lock_init(&pblk->lock);

#ifdef CONFIG_NVM_PBLK_DEBUG
	atomic_long_set(&pblk->inflight_writes, 0);
	atomic_long_set(&pblk->padded_writes, 0);
	atomic_long_set(&pblk->padded_wb, 0);
	atomic_long_set(&pblk->req_writes, 0);
	atomic_long_set(&pblk->sub_writes, 0);
	atomic_long_set(&pblk->sync_writes, 0);
	atomic_long_set(&pblk->inflight_reads, 0);
	atomic_long_set(&pblk->cache_reads, 0);
	atomic_long_set(&pblk->sync_reads, 0);
	atomic_long_set(&pblk->recov_writes, 0);
	atomic_long_set(&pblk->recov_gc_writes, 0);
	atomic_long_set(&pblk->recov_gc_reads, 0);
#endif

	atomic_long_set(&pblk->read_failed, 0);
	atomic_long_set(&pblk->read_empty, 0);
	atomic_long_set(&pblk->read_high_ecc, 0);
	atomic_long_set(&pblk->read_failed_gc, 0);
	atomic_long_set(&pblk->write_failed, 0);
	atomic_long_set(&pblk->erase_failed, 0);

	ret = pblk_core_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize core\n");
		goto fail;
	}

	ret = pblk_lines_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize lines\n");
		goto fail_free_core;
	}

	ret = pblk_rwb_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize write buffer\n");
		goto fail_free_lines;
	}

	ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY);
	if (ret) {
		pblk_err(pblk, "could not initialize maps\n");
		goto fail_free_rwb;
	}

	ret = pblk_writer_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize write thread\n");
		goto fail_free_l2p;
	}

	ret = pblk_gc_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize gc\n");
		goto fail_stop_writer;
	}

	/* inherit the size from the underlying device */
	blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
	blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));

	blk_queue_write_cache(tqueue, true, false);

	tqueue->limits.discard_granularity = geo->clba * geo->csecs;
	tqueue->limits.discard_alignment = 0;
	blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue);

	pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
			geo->all_luns, pblk->l_mg.nr_lines,
			(unsigned long long)pblk->rl.nr_secs,
			pblk->rwb.nr_entries);

	wake_up_process(pblk->writer_ts);

	/* Check if we need to start GC */
	pblk_gc_should_kick(pblk);

	return pblk;

fail_stop_writer:
	pblk_writer_stop(pblk);
fail_free_l2p:
	pblk_l2p_free(pblk);
fail_free_rwb:
	pblk_rwb_free(pblk);
fail_free_lines:
	pblk_lines_free(pblk);
fail_free_core:
	pblk_core_free(pblk);
fail:
	kfree(pblk);
	return ERR_PTR(ret);
}
/* physical block device target */
static struct nvm_tgt_type tt_pblk = {
	.name		= "pblk",
	.version	= {1, 0, 0},

	.make_rq	= pblk_make_rq,
	.capacity	= pblk_capacity,

	.init		= pblk_init,
	.exit		= pblk_exit,

	.sysfs_init	= pblk_sysfs_init,
	.sysfs_exit	= pblk_sysfs_exit,
	.owner		= THIS_MODULE,
};
static int __init pblk_module_init(void)
{
	int ret;

	ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
	if (ret)
		return ret;

	ret = nvm_register_tgt_type(&tt_pblk);
	if (ret)
		bioset_exit(&pblk_bio_set);

	return ret;
}

static void pblk_module_exit(void)
{
	bioset_exit(&pblk_bio_set);
	nvm_unregister_tgt_type(&tt_pblk);
}

module_init(pblk_module_init);
module_exit(pblk_module_exit);
MODULE_AUTHOR("Javier Gonzalez <javier@cnexlabs.com>");
MODULE_AUTHOR("Matias Bjorling <matias@cnexlabs.com>");
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs");
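/*
 * Usage sketch (assumed external tooling, not part of this file): once the
 * module is loaded, a pblk target is typically created on an Open-Channel
 * device through the LightNVM ioctls, e.g. with nvme-cli:
 *
 *   nvme lnvm create -d nvme0n1 -n pblk0 -t pblk -b 0 -e 127
 *
 * which ends up calling pblk_init() above for the selected LUN range;
 * "nvme lnvm remove -n pblk0" tears the target down via pblk_exit().
 */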