// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2015 IT University of Copenhagen (rrpc.c)
 * Copyright (C) 2016 CNEX Labs
 * Initial release: Javier Gonzalez <javier@cnexlabs.com>
 *                  Matias Bjorling <matias@cnexlabs.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Implementation of a physical block-device target for Open-channel SSDs.
 *
 * pblk-init.c - pblk's initialization.
 */
#include "pblk.h"
#include "pblk-trace.h"
static unsigned int write_buffer_size;

module_param(write_buffer_size, uint, 0644);
MODULE_PARM_DESC(write_buffer_size, "number of entries in a write buffer");
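/*
 * Usage note (illustrative, not part of the original source): since the
 * parameter is declared with mode 0644, it can be given at module load time,
 * e.g. "modprobe pblk write_buffer_size=4096", and inspected afterwards
 * through /sys/module/pblk/parameters/write_buffer_size. Leaving it at 0
 * lets pblk_rwb_init() size the write buffer from the device geometry.
 */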
struct pblk_global_caches {
	struct kmem_cache	*ws;
	struct kmem_cache	*rec;
	struct kmem_cache	*g_rq;
	struct kmem_cache	*w_rq;

	struct kref		kref;

	struct mutex		mutex; /* Ensures consistency between
					* caches and kref
					*/
};
static struct pblk_global_caches pblk_caches = {
	.mutex = __MUTEX_INITIALIZER(pblk_caches.mutex),
	.kref = KREF_INIT(0),
};
struct bio_set pblk_bio_set;
static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio)
{
	struct pblk *pblk = q->queuedata;

	if (bio_op(bio) == REQ_OP_DISCARD) {
		pblk_discard(pblk, bio);
		if (!(bio->bi_opf & REQ_PREFLUSH)) {
			bio_endio(bio);
			return BLK_QC_T_NONE;
		}
	}

	/* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap
	 * constraint. Writes can be of arbitrary size.
	 */
	if (bio_data_dir(bio) == READ) {
		blk_queue_split(q, &bio);
		pblk_submit_read(pblk, bio);
	} else {
		/* Prevent deadlock in the case of a modest LUN configuration
		 * and large user I/Os. Unless stalled, the rate limiter
		 * leaves at least 256KB available for user I/O.
		 */
		if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl))
			blk_queue_split(q, &bio);

		pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
	}

	return BLK_QC_T_NONE;
}
static size_t pblk_trans_map_size(struct pblk *pblk)
{
	int entry_size = 8;

	if (pblk->addrf_len < 32)
		entry_size = 4;

	return entry_size * pblk->capacity;
}
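/*
 * Sizing example (hypothetical numbers, for illustration only): a target
 * whose address format needs 32 bits or more keeps 8-byte entries, so a
 * capacity of 2^30 sectors costs 8 GiB of host memory for the L2P table;
 * with a sub-32-bit address format the same capacity costs 4 GiB.
 */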
#ifdef CONFIG_NVM_PBLK_DEBUG
static u32 pblk_l2p_crc(struct pblk *pblk)
{
	size_t map_size;
	u32 crc = ~(u32)0;

	map_size = pblk_trans_map_size(pblk);
	crc = crc32_le(crc, pblk->trans_map, map_size);
	return crc;
}
#endif
static void pblk_l2p_free(struct pblk *pblk)
{
	vfree(pblk->trans_map);
}
static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
{
	struct pblk_line *line = NULL;

	if (factory_init) {
		guid_gen(&pblk->instance_uuid);
	} else {
		line = pblk_recov_l2p(pblk);
		if (IS_ERR(line)) {
			pblk_err(pblk, "could not recover l2p table\n");
			return -EFAULT;
		}
	}

#ifdef CONFIG_NVM_PBLK_DEBUG
	pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
#endif

	/* Free full lines directly as GC has not been started yet */
	pblk_gc_free_full_lines(pblk);

	if (!line) {
		/* Configure next line for user data */
		line = pblk_line_get_first_data(pblk);
		if (!line)
			return -EFAULT;
	}

	return 0;
}
static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
{
	sector_t i;
	struct ppa_addr ppa;
	size_t map_size;
	int ret = 0;

	map_size = pblk_trans_map_size(pblk);
	pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN
				    | __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM,
				    PAGE_KERNEL);
	if (!pblk->trans_map) {
		pblk_err(pblk, "failed to allocate L2P (need %zu of memory)\n",
				map_size);
		return -ENOMEM;
	}

	pblk_ppa_set_empty(&ppa);

	for (i = 0; i < pblk->capacity; i++)
		pblk_trans_map_set(pblk, i, ppa);

	ret = pblk_l2p_recover(pblk, factory_init);
	if (ret)
		vfree(pblk->trans_map);

	return ret;
}
static void pblk_rwb_free(struct pblk *pblk)
{
	if (pblk_rb_tear_down_check(&pblk->rwb))
		pblk_err(pblk, "write buffer error on tear down\n");

	pblk_rb_free(&pblk->rwb);
}
static int pblk_rwb_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	unsigned long buffer_size;
	int pgs_in_buffer, threshold;

	threshold = geo->mw_cunits * geo->all_luns;
	pgs_in_buffer = (max(geo->mw_cunits, geo->ws_opt) + geo->ws_opt)
							* geo->all_luns;

	if (write_buffer_size && (write_buffer_size > pgs_in_buffer))
		buffer_size = write_buffer_size;
	else
		buffer_size = pgs_in_buffer;

	return pblk_rb_init(&pblk->rwb, buffer_size, threshold, geo->csecs);
}
static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo,
			     struct nvm_addrf_12 *dst)
{
	struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf;
	int power_len;

	/* Re-calculate channel and lun format to adapt to configuration */
	power_len = get_count_order(geo->num_ch);
	if (1 << power_len != geo->num_ch) {
		pblk_err(pblk, "supports only power-of-two channel config.\n");
		return -EINVAL;
	}
	dst->ch_len = power_len;

	power_len = get_count_order(geo->num_lun);
	if (1 << power_len != geo->num_lun) {
		pblk_err(pblk, "supports only power-of-two LUN config.\n");
		return -EINVAL;
	}
	dst->lun_len = power_len;

	dst->blk_len = src->blk_len;
	dst->pg_len = src->pg_len;
	dst->pln_len = src->pln_len;
	dst->sec_len = src->sec_len;

	dst->sec_offset = 0;
	dst->pln_offset = dst->sec_len;
	dst->ch_offset = dst->pln_offset + dst->pln_len;
	dst->lun_offset = dst->ch_offset + dst->ch_len;
	dst->pg_offset = dst->lun_offset + dst->lun_len;
	dst->blk_offset = dst->pg_offset + dst->pg_len;

	dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset;
	dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset;
	dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset;
	dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset;
	dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset;
	dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset;

	return dst->blk_offset + src->blk_len;
}
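/*
 * Packing example (hypothetical 1.2 geometry, for illustration only): with
 * sec_len = 2, pln_len = 1, ch_len = 3, lun_len = 2, pg_len = 9 and
 * blk_len = 12, the offsets become sec = 0, pln = 2, ch = 3, lun = 6,
 * pg = 8 and blk = 17, and the function reports an address length of
 * 17 + 12 = 29 bits.
 */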
static int pblk_set_addrf_20(struct nvm_geo *geo, struct nvm_addrf *adst,
			     struct pblk_addrf *udst)
{
	struct nvm_addrf *src = &geo->addrf;

	adst->ch_len = get_count_order(geo->num_ch);
	adst->lun_len = get_count_order(geo->num_lun);
	adst->chk_len = src->chk_len;
	adst->sec_len = src->sec_len;

	adst->sec_offset = 0;
	adst->ch_offset = adst->sec_len;
	adst->lun_offset = adst->ch_offset + adst->ch_len;
	adst->chk_offset = adst->lun_offset + adst->lun_len;

	adst->sec_mask = ((1ULL << adst->sec_len) - 1) << adst->sec_offset;
	adst->chk_mask = ((1ULL << adst->chk_len) - 1) << adst->chk_offset;
	adst->lun_mask = ((1ULL << adst->lun_len) - 1) << adst->lun_offset;
	adst->ch_mask = ((1ULL << adst->ch_len) - 1) << adst->ch_offset;

	udst->sec_stripe = geo->ws_opt;
	udst->ch_stripe = geo->num_ch;
	udst->lun_stripe = geo->num_lun;

	udst->sec_lun_stripe = udst->sec_stripe * udst->ch_stripe;
	udst->sec_ws_stripe = udst->sec_lun_stripe * udst->lun_stripe;

	return adst->chk_offset + adst->chk_len;
}
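/*
 * Stripe example (hypothetical 2.0 geometry, for illustration only): with
 * ws_opt = 8, num_ch = 4 and num_lun = 8, udst describes an 8-sector
 * stripe per chunk, sec_lun_stripe = 8 * 4 = 32 sectors across all
 * channels, and sec_ws_stripe = 32 * 8 = 256 sectors across all LUNs.
 */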
static int pblk_set_addrf(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	int mod;

	switch (geo->version) {
	case NVM_OCSSD_SPEC_12:
		div_u64_rem(geo->clba, pblk->min_write_pgs, &mod);
		if (mod) {
			pblk_err(pblk, "bad configuration of sectors/pages\n");
			return -EINVAL;
		}

		pblk->addrf_len = pblk_set_addrf_12(pblk, geo,
							(void *)&pblk->addrf);
		break;
	case NVM_OCSSD_SPEC_20:
		pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf,
							&pblk->uaddrf);
		break;
	default:
		pblk_err(pblk, "OCSSD revision not supported (%d)\n",
							geo->version);
		return -EINVAL;
	}

	return 0;
}
static int pblk_create_global_caches(void)
{
	pblk_caches.ws = kmem_cache_create("pblk_blk_ws",
				sizeof(struct pblk_line_ws), 0, 0, NULL);
	if (!pblk_caches.ws)
		return -ENOMEM;

	pblk_caches.rec = kmem_cache_create("pblk_rec",
				sizeof(struct pblk_rec_ctx), 0, 0, NULL);
	if (!pblk_caches.rec)
		goto fail_destroy_ws;

	pblk_caches.g_rq = kmem_cache_create("pblk_g_rq", pblk_g_rq_size,
				0, 0, NULL);
	if (!pblk_caches.g_rq)
		goto fail_destroy_rec;

	pblk_caches.w_rq = kmem_cache_create("pblk_w_rq", pblk_w_rq_size,
				0, 0, NULL);
	if (!pblk_caches.w_rq)
		goto fail_destroy_g_rq;

	return 0;

fail_destroy_g_rq:
	kmem_cache_destroy(pblk_caches.g_rq);
fail_destroy_rec:
	kmem_cache_destroy(pblk_caches.rec);
fail_destroy_ws:
	kmem_cache_destroy(pblk_caches.ws);

	return -ENOMEM;
}
static int pblk_get_global_caches(void)
{
	int ret = 0;

	mutex_lock(&pblk_caches.mutex);

	if (kref_get_unless_zero(&pblk_caches.kref))
		goto out;

	ret = pblk_create_global_caches();
	if (!ret)
		kref_init(&pblk_caches.kref);

out:
	mutex_unlock(&pblk_caches.mutex);
	return ret;
}
static void pblk_destroy_global_caches(struct kref *ref)
{
	struct pblk_global_caches *c;

	c = container_of(ref, struct pblk_global_caches, kref);

	kmem_cache_destroy(c->ws);
	kmem_cache_destroy(c->rec);
	kmem_cache_destroy(c->g_rq);
	kmem_cache_destroy(c->w_rq);
}
static void pblk_put_global_caches(void)
{
	mutex_lock(&pblk_caches.mutex);
	kref_put(&pblk_caches.kref, pblk_destroy_global_caches);
	mutex_unlock(&pblk_caches.mutex);
}
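/*
 * Lifecycle note (illustrative, not part of the original source): the first
 * pblk instance to call pblk_get_global_caches() creates the four slab
 * caches and initializes the kref to 1; later instances only take another
 * reference, and the caches are destroyed when the last instance drops its
 * reference through pblk_put_global_caches().
 */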
static int pblk_core_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	int ret, max_write_ppas;

	atomic64_set(&pblk->user_wa, 0);
	atomic64_set(&pblk->pad_wa, 0);
	atomic64_set(&pblk->gc_wa, 0);
	pblk->user_rst_wa = 0;
	pblk->pad_rst_wa = 0;
	pblk->gc_rst_wa = 0;

	atomic64_set(&pblk->nr_flush, 0);
	pblk->nr_flush_rst = 0;

	pblk->min_write_pgs = geo->ws_opt;
	pblk->min_write_pgs_data = pblk->min_write_pgs;
	max_write_ppas = pblk->min_write_pgs * geo->all_luns;
	pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
	pblk->max_write_pgs = min_t(int, pblk->max_write_pgs,
		queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT));
	pblk_set_sec_per_write(pblk, pblk->min_write_pgs);

	pblk->oob_meta_size = geo->sos;
	if (!pblk_is_oob_meta_supported(pblk)) {
		/* For drives which do not have the OOB metadata feature,
		 * we need to use so-called packed metadata in order to
		 * support recovery. Packed metadata stores the same
		 * information as OOB metadata (the l2p table mapping),
		 * but in the form of a single page at the end of
		 * every write request.
		 */
		if (pblk->min_write_pgs
			* sizeof(struct pblk_sec_meta) > PAGE_SIZE) {
			/* We want to keep all the packed metadata for a
			 * write request on a single page, so we need to
			 * ensure that it will fit.
			 *
			 * This is more of a sanity check, since there is
			 * no device with such a big minimal write size
			 * (above 1 megabyte).
			 */
			pblk_err(pblk, "Not supported min write size\n");
			return -EINVAL;
		}
		/* For the packed metadata approach we do some simplification.
		 * On the read path we always issue requests whose size is
		 * equal to max_write_pgs, with all pages filled with user
		 * payload except the last page, which is filled with the
		 * packed metadata.
		 */
		pblk->max_write_pgs = pblk->min_write_pgs;
		pblk->min_write_pgs_data = pblk->min_write_pgs - 1;
	}

	pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t),
								GFP_KERNEL);
	if (!pblk->pad_dist)
		return -ENOMEM;

	if (pblk_get_global_caches())
		goto fail_free_pad_dist;

	/* Internal bios can be at most the sectors signaled by the device. */
	ret = mempool_init_page_pool(&pblk->page_bio_pool, NVM_MAX_VLBA, 0);
	if (ret)
		goto free_global_caches;

	ret = mempool_init_slab_pool(&pblk->gen_ws_pool, PBLK_GEN_WS_POOL_SIZE,
				     pblk_caches.ws);
	if (ret)
		goto free_page_bio_pool;

	ret = mempool_init_slab_pool(&pblk->rec_pool, geo->all_luns,
				     pblk_caches.rec);
	if (ret)
		goto free_gen_ws_pool;

	ret = mempool_init_slab_pool(&pblk->r_rq_pool, geo->all_luns,
				     pblk_caches.g_rq);
	if (ret)
		goto free_rec_pool;

	ret = mempool_init_slab_pool(&pblk->e_rq_pool, geo->all_luns,
				     pblk_caches.g_rq);
	if (ret)
		goto free_r_rq_pool;

	ret = mempool_init_slab_pool(&pblk->w_rq_pool, geo->all_luns,
				     pblk_caches.w_rq);
	if (ret)
		goto free_e_rq_pool;

	pblk->close_wq = alloc_workqueue("pblk-close-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS);
	if (!pblk->close_wq)
		goto free_w_rq_pool;

	pblk->bb_wq = alloc_workqueue("pblk-bb-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!pblk->bb_wq)
		goto free_close_wq;

	pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!pblk->r_end_wq)
		goto free_bb_wq;

	if (pblk_set_addrf(pblk))
		goto free_r_end_wq;

	INIT_LIST_HEAD(&pblk->compl_list);
	INIT_LIST_HEAD(&pblk->resubmit_list);

	return 0;

free_r_end_wq:
	destroy_workqueue(pblk->r_end_wq);
free_bb_wq:
	destroy_workqueue(pblk->bb_wq);
free_close_wq:
	destroy_workqueue(pblk->close_wq);
free_w_rq_pool:
	mempool_exit(&pblk->w_rq_pool);
free_e_rq_pool:
	mempool_exit(&pblk->e_rq_pool);
free_r_rq_pool:
	mempool_exit(&pblk->r_rq_pool);
free_rec_pool:
	mempool_exit(&pblk->rec_pool);
free_gen_ws_pool:
	mempool_exit(&pblk->gen_ws_pool);
free_page_bio_pool:
	mempool_exit(&pblk->page_bio_pool);
free_global_caches:
	pblk_put_global_caches();
fail_free_pad_dist:
	kfree(pblk->pad_dist);
	return -ENOMEM;
}
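/*
 * Sizing example (hypothetical geometry, for illustration only): with
 * ws_opt = 8 and all_luns = 128, max_write_ppas is 1024 sectors, which is
 * then clamped to NVM_MAX_VLBA and to what the underlying queue accepts
 * per command, so a single write request never spans more sectors than
 * either limit allows.
 */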
static void pblk_core_free(struct pblk *pblk)
{
	if (pblk->close_wq)
		destroy_workqueue(pblk->close_wq);

	if (pblk->r_end_wq)
		destroy_workqueue(pblk->r_end_wq);

	if (pblk->bb_wq)
		destroy_workqueue(pblk->bb_wq);

	mempool_exit(&pblk->page_bio_pool);
	mempool_exit(&pblk->gen_ws_pool);
	mempool_exit(&pblk->rec_pool);
	mempool_exit(&pblk->r_rq_pool);
	mempool_exit(&pblk->e_rq_pool);
	mempool_exit(&pblk->w_rq_pool);

	pblk_put_global_caches();
	kfree(pblk->pad_dist);
}
static void pblk_line_mg_free(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	int i;

	kfree(l_mg->bb_template);
	kfree(l_mg->bb_aux);
	kfree(l_mg->vsc_list);

	for (i = 0; i < PBLK_DATA_LINES; i++) {
		kfree(l_mg->sline_meta[i]);
		kvfree(l_mg->eline_meta[i]->buf);
		kfree(l_mg->eline_meta[i]);
	}

	mempool_destroy(l_mg->bitmap_pool);
	kmem_cache_destroy(l_mg->bitmap_cache);
}
static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg,
				struct pblk_line *line)
{
	struct pblk_w_err_gc *w_err_gc = line->w_err_gc;

	kfree(line->blk_bitmap);
	kfree(line->erase_bitmap);
	kfree(line->chks);

	kvfree(w_err_gc->lba_list);
	kfree(w_err_gc);
}
static void pblk_lines_free(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line *line;
	int i;

	for (i = 0; i < l_mg->nr_lines; i++) {
		line = &pblk->lines[i];

		pblk_line_free(line);
		pblk_line_meta_free(l_mg, line);
	}

	pblk_line_mg_free(pblk);

	kfree(pblk->luns);
	kfree(pblk->lines);
}
static int pblk_luns_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_lun *rlun;
	int i;

	/* TODO: Implement unbalanced LUN support */
	if (geo->num_lun < 0) {
		pblk_err(pblk, "unbalanced LUN config.\n");
		return -EINVAL;
	}

	pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun),
								GFP_KERNEL);
	if (!pblk->luns)
		return -ENOMEM;

	for (i = 0; i < geo->all_luns; i++) {
		/* Stripe across channels */
		int ch = i % geo->num_ch;
		int lun_raw = i / geo->num_ch;
		int lunid = lun_raw + ch * geo->num_lun;

		rlun = &pblk->luns[i];
		rlun->bppa = dev->luns[lunid];

		sema_init(&rlun->wr_sem, 1);
	}

	return 0;
}
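/*
 * Mapping example (hypothetical geometry, for illustration only): with
 * num_ch = 4 and num_lun = 8, pblk LUN i = 5 maps to ch = 5 % 4 = 1,
 * lun_raw = 5 / 4 = 1 and lunid = 1 + 1 * 8 = 9, so consecutive pblk LUNs
 * land on different channels before a channel is reused.
 */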
/* See comment over struct line_emeta definition */
static unsigned int calc_emeta_len(struct pblk *pblk)
{
	struct pblk_line_meta *lm = &pblk->lm;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;

	/* Round to sector size so that lba_list starts on its own sector */
	lm->emeta_sec[1] = DIV_ROUND_UP(
			sizeof(struct line_emeta) + lm->blk_bitmap_len +
			sizeof(struct wa_counters), geo->csecs);
	lm->emeta_len[1] = lm->emeta_sec[1] * geo->csecs;

	/* Round to sector size so that vsc_list starts on its own sector */
	lm->dsec_per_line = lm->sec_per_line - lm->emeta_sec[0];
	lm->emeta_sec[2] = DIV_ROUND_UP(lm->dsec_per_line * sizeof(u64),
			geo->csecs);
	lm->emeta_len[2] = lm->emeta_sec[2] * geo->csecs;

	lm->emeta_sec[3] = DIV_ROUND_UP(l_mg->nr_lines * sizeof(u32),
			geo->csecs);
	lm->emeta_len[3] = lm->emeta_sec[3] * geo->csecs;

	lm->vsc_list_len = l_mg->nr_lines * sizeof(u32);

	return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]);
}
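/*
 * Rounding example (hypothetical numbers, for illustration only): with
 * csecs = 4096 and dsec_per_line = 98304, the lba_list needs
 * 98304 * 8 = 786432 bytes, which rounds up to 192 sectors for
 * emeta_sec[2]; the vsc_list is rounded up to whole sectors the same way.
 */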
static int pblk_set_provision(struct pblk *pblk, int nr_free_chks)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	struct nvm_geo *geo = &dev->geo;
	sector_t provisioned;
	int sec_meta, blk_meta, clba;
	int minimum;

	if (geo->op == NVM_TARGET_DEFAULT_OP)
		pblk->op = PBLK_DEFAULT_OP;
	else
		pblk->op = geo->op;

	minimum = pblk_get_min_chks(pblk);
	provisioned = nr_free_chks;
	provisioned *= (100 - pblk->op);
	sector_div(provisioned, 100);

	if ((nr_free_chks - provisioned) < minimum) {
		if (geo->op != NVM_TARGET_DEFAULT_OP) {
			pblk_err(pblk, "OP too small to create a sane instance\n");
			return -EINTR;
		}

		/* If the user did not specify an OP value, and PBLK_DEFAULT_OP
		 * is not enough, calculate and set sane value
		 */

		provisioned = nr_free_chks - minimum;
		pblk->op = (100 * minimum) / nr_free_chks;
		pblk_info(pblk, "Default OP insufficient, adjusting OP to %d\n",
				pblk->op);
	}

	pblk->op_blks = nr_free_chks - provisioned;

	/* Internally pblk manages all free blocks, but all calculations based
	 * on user capacity consider only provisioned blocks
	 */
	pblk->rl.total_blocks = nr_free_chks;

	/* Consider sectors used for metadata */
	sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
	blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);

	clba = (geo->clba / pblk->min_write_pgs) * pblk->min_write_pgs_data;
	pblk->capacity = (provisioned - blk_meta) * clba;

	atomic_set(&pblk->rl.free_blocks, nr_free_chks);
	atomic_set(&pblk->rl.free_user_blocks, nr_free_chks);

	return 0;
}
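/*
 * Provisioning example (hypothetical numbers, for illustration only): with
 * nr_free_chks = 1000 and an over-provisioning value of 20%, provisioned
 * becomes 800 chunks and op_blks 200; if 1000 - 800 fell below the minimum
 * returned by pblk_get_min_chks() and the OP came from PBLK_DEFAULT_OP,
 * the code above would recompute the OP instead of failing.
 */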
static int pblk_setup_line_meta_chk(struct pblk *pblk, struct pblk_line *line,
				   struct nvm_chk_meta *meta)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	int i, nr_bad_chks = 0;

	for (i = 0; i < lm->blk_per_line; i++) {
		struct pblk_lun *rlun = &pblk->luns[i];
		struct nvm_chk_meta *chunk;
		struct nvm_chk_meta *chunk_meta;
		struct ppa_addr ppa;
		int pos;

		ppa = rlun->bppa;
		pos = pblk_ppa_to_pos(geo, ppa);
		chunk = &line->chks[pos];

		ppa.m.chk = line->id;
		chunk_meta = pblk_chunk_get_off(pblk, meta, ppa);

		chunk->state = chunk_meta->state;
		chunk->type = chunk_meta->type;
		chunk->wi = chunk_meta->wi;
		chunk->slba = chunk_meta->slba;
		chunk->cnlb = chunk_meta->cnlb;
		chunk->wp = chunk_meta->wp;

		trace_pblk_chunk_state(pblk_disk_name(pblk), &ppa,
					chunk->state);

		if (chunk->type & NVM_CHK_TP_SZ_SPEC) {
			WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n");
			continue;
		}

		if (!(chunk->state & NVM_CHK_ST_OFFLINE))
			continue;

		set_bit(pos, line->blk_bitmap);
		nr_bad_chks++;
	}

	return nr_bad_chks;
}
static long pblk_setup_line_meta(struct pblk *pblk, struct pblk_line *line,
				 void *chunk_meta, int line_id)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	long nr_bad_chks, chk_in_line;

	line->pblk = pblk;
	line->id = line_id;
	line->type = PBLK_LINETYPE_FREE;
	line->state = PBLK_LINESTATE_NEW;
	line->gc_group = PBLK_LINEGC_NONE;
	line->vsc = &l_mg->vsc_list[line_id];
	spin_lock_init(&line->lock);

	nr_bad_chks = pblk_setup_line_meta_chk(pblk, line, chunk_meta);

	chk_in_line = lm->blk_per_line - nr_bad_chks;
	if (nr_bad_chks < 0 || nr_bad_chks > lm->blk_per_line ||
					chk_in_line < lm->min_blk_line) {
		line->state = PBLK_LINESTATE_BAD;
		list_add_tail(&line->list, &l_mg->bad_list);
		return 0;
	}

	atomic_set(&line->blk_in_line, chk_in_line);
	list_add_tail(&line->list, &l_mg->free_list);
	l_mg->nr_free_lines++;

	return chk_in_line;
}
static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line)
{
	struct pblk_line_meta *lm = &pblk->lm;

	line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
	if (!line->blk_bitmap)
		return -ENOMEM;

	line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
	if (!line->erase_bitmap)
		goto free_blk_bitmap;

	line->chks = kmalloc_array(lm->blk_per_line,
				   sizeof(struct nvm_chk_meta), GFP_KERNEL);
	if (!line->chks)
		goto free_erase_bitmap;

	line->w_err_gc = kzalloc(sizeof(struct pblk_w_err_gc), GFP_KERNEL);
	if (!line->w_err_gc)
		goto free_chks;

	return 0;

free_chks:
	kfree(line->chks);
free_erase_bitmap:
	kfree(line->erase_bitmap);
free_blk_bitmap:
	kfree(line->blk_bitmap);

	return -ENOMEM;
}
static int pblk_line_mg_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	int i, bb_distance;

	l_mg->nr_lines = geo->num_chk;
	l_mg->log_line = l_mg->data_line = NULL;
	l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
	l_mg->nr_free_lines = 0;
	bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);

	INIT_LIST_HEAD(&l_mg->free_list);
	INIT_LIST_HEAD(&l_mg->corrupt_list);
	INIT_LIST_HEAD(&l_mg->bad_list);
	INIT_LIST_HEAD(&l_mg->gc_full_list);
	INIT_LIST_HEAD(&l_mg->gc_high_list);
	INIT_LIST_HEAD(&l_mg->gc_mid_list);
	INIT_LIST_HEAD(&l_mg->gc_low_list);
	INIT_LIST_HEAD(&l_mg->gc_empty_list);
	INIT_LIST_HEAD(&l_mg->gc_werr_list);

	INIT_LIST_HEAD(&l_mg->emeta_list);

	l_mg->gc_lists[0] = &l_mg->gc_werr_list;
	l_mg->gc_lists[1] = &l_mg->gc_high_list;
	l_mg->gc_lists[2] = &l_mg->gc_mid_list;
	l_mg->gc_lists[3] = &l_mg->gc_low_list;

	spin_lock_init(&l_mg->free_lock);
	spin_lock_init(&l_mg->close_lock);
	spin_lock_init(&l_mg->gc_lock);

	l_mg->vsc_list = kcalloc(l_mg->nr_lines, sizeof(__le32), GFP_KERNEL);
	if (!l_mg->vsc_list)
		goto fail;

	l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
	if (!l_mg->bb_template)
		goto fail_free_vsc_list;

	l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
	if (!l_mg->bb_aux)
		goto fail_free_bb_template;

	/* smeta is always small enough to fit on a kmalloc memory allocation,
	 * emeta depends on the number of LUNs allocated to the pblk instance
	 */
	for (i = 0; i < PBLK_DATA_LINES; i++) {
		l_mg->sline_meta[i] = kmalloc(lm->smeta_len, GFP_KERNEL);
		if (!l_mg->sline_meta[i])
			goto fail_free_smeta;
	}

	l_mg->bitmap_cache = kmem_cache_create("pblk_lm_bitmap",
			lm->sec_bitmap_len, 0, 0, NULL);
	if (!l_mg->bitmap_cache)
		goto fail_free_smeta;

	/* the bitmap pool is used for both valid and map bitmaps */
	l_mg->bitmap_pool = mempool_create_slab_pool(PBLK_DATA_LINES * 2,
				l_mg->bitmap_cache);
	if (!l_mg->bitmap_pool)
		goto fail_destroy_bitmap_cache;

	/* emeta allocates three different buffers for managing metadata with
	 * in-memory and in-media layouts
	 */
	for (i = 0; i < PBLK_DATA_LINES; i++) {
		struct pblk_emeta *emeta;

		emeta = kmalloc(sizeof(struct pblk_emeta), GFP_KERNEL);
		if (!emeta)
			goto fail_free_emeta;

		emeta->buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL);
		if (!emeta->buf) {
			kfree(emeta);
			goto fail_free_emeta;
		}

		emeta->nr_entries = lm->emeta_sec[0];
		l_mg->eline_meta[i] = emeta;
	}

	for (i = 0; i < l_mg->nr_lines; i++)
		l_mg->vsc_list[i] = cpu_to_le32(EMPTY_ENTRY);

	bb_distance = (geo->all_luns) * geo->ws_opt;
	for (i = 0; i < lm->sec_per_line; i += bb_distance)
		bitmap_set(l_mg->bb_template, i, geo->ws_opt);

	return 0;

fail_free_emeta:
	while (--i >= 0) {
		kvfree(l_mg->eline_meta[i]->buf);
		kfree(l_mg->eline_meta[i]);
	}

	mempool_destroy(l_mg->bitmap_pool);
fail_destroy_bitmap_cache:
	kmem_cache_destroy(l_mg->bitmap_cache);
fail_free_smeta:
	for (i = 0; i < PBLK_DATA_LINES; i++)
		kfree(l_mg->sline_meta[i]);
fail_free_bb_template:
	kfree(l_mg->bb_template);
fail_free_vsc_list:
	kfree(l_mg->vsc_list);
fail:
	return -ENOMEM;
}
static int pblk_line_meta_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	unsigned int smeta_len, emeta_len;
	int i;

	lm->sec_per_line = geo->clba * geo->all_luns;
	lm->blk_per_line = geo->all_luns;
	lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
	lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
	lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
	lm->mid_thrs = lm->sec_per_line / 2;
	lm->high_thrs = lm->sec_per_line / 4;
	lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs;

	/* Calculate necessary pages for smeta. See comment over struct
	 * line_smeta definition
	 */
	i = 1;
add_smeta_page:
	lm->smeta_sec = i * geo->ws_opt;
	lm->smeta_len = lm->smeta_sec * geo->csecs;

	smeta_len = sizeof(struct line_smeta) + lm->lun_bitmap_len;
	if (smeta_len > lm->smeta_len) {
		i++;
		goto add_smeta_page;
	}

	/* Calculate necessary pages for emeta. See comment over struct
	 * line_emeta definition
	 */
	i = 1;
add_emeta_page:
	lm->emeta_sec[0] = i * geo->ws_opt;
	lm->emeta_len[0] = lm->emeta_sec[0] * geo->csecs;

	emeta_len = calc_emeta_len(pblk);
	if (emeta_len > lm->emeta_len[0]) {
		i++;
		goto add_emeta_page;
	}

	lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0;

	lm->min_blk_line = 1;
	if (geo->all_luns > 1)
		lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
					lm->emeta_sec[0], geo->clba);

	if (lm->min_blk_line > lm->blk_per_line) {
		pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n",
							lm->blk_per_line);
		return -EINVAL;
	}

	return 0;
}
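/*
 * Sizing example (hypothetical numbers, for illustration only): with
 * ws_opt = 8 and csecs = 4096, one pass of the add_smeta_page loop reserves
 * smeta_sec = 8 sectors (32768 bytes), which is normally far more than
 * sizeof(struct line_smeta) plus the LUN bitmap, so the loop exits after a
 * single iteration; the add_emeta_page loop grows the same way in ws_opt
 * increments.
 */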
static int pblk_lines_init(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line *line;
	void *chunk_meta;
	int nr_free_chks = 0;
	int i, ret;

	ret = pblk_line_meta_init(pblk);
	if (ret)
		return ret;

	ret = pblk_line_mg_init(pblk);
	if (ret)
		return ret;

	ret = pblk_luns_init(pblk);
	if (ret)
		goto fail_free_meta;

	chunk_meta = pblk_get_chunk_meta(pblk);
	if (IS_ERR(chunk_meta)) {
		ret = PTR_ERR(chunk_meta);
		goto fail_free_luns;
	}

	pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line),
								GFP_KERNEL);
	if (!pblk->lines) {
		ret = -ENOMEM;
		goto fail_free_chunk_meta;
	}

	for (i = 0; i < l_mg->nr_lines; i++) {
		line = &pblk->lines[i];

		ret = pblk_alloc_line_meta(pblk, line);
		if (ret)
			goto fail_free_lines;

		nr_free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i);

		trace_pblk_line_state(pblk_disk_name(pblk), line->id,
								line->state);
	}

	if (!nr_free_chks) {
		pblk_err(pblk, "too many bad blocks prevent for sane instance\n");
		ret = -EINTR;
		goto fail_free_lines;
	}

	ret = pblk_set_provision(pblk, nr_free_chks);
	if (ret)
		goto fail_free_lines;

	vfree(chunk_meta);
	return 0;

fail_free_lines:
	while (--i >= 0)
		pblk_line_meta_free(l_mg, &pblk->lines[i]);
	kfree(pblk->lines);
fail_free_chunk_meta:
	vfree(chunk_meta);
fail_free_luns:
	kfree(pblk->luns);
fail_free_meta:
	pblk_line_mg_free(pblk);

	return ret;
}
static int pblk_writer_init(struct pblk *pblk)
{
	pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
	if (IS_ERR(pblk->writer_ts)) {
		int err = PTR_ERR(pblk->writer_ts);

		if (err != -EINTR)
			pblk_err(pblk, "could not allocate writer kthread (%d)\n",
					err);
		return err;
	}

	timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));

	return 0;
}
static void pblk_writer_stop(struct pblk *pblk)
{
	/* The pipeline must be stopped and the write buffer emptied before the
	 * write thread is stopped
	 */
	WARN(pblk_rb_read_count(&pblk->rwb),
			"Stopping not fully persisted write buffer\n");

	WARN(pblk_rb_sync_count(&pblk->rwb),
			"Stopping not fully synced write buffer\n");

	del_timer_sync(&pblk->wtimer);
	if (pblk->writer_ts)
		kthread_stop(pblk->writer_ts);
}
static void pblk_free(struct pblk *pblk)
{
	pblk_lines_free(pblk);
	pblk_l2p_free(pblk);
	pblk_rwb_free(pblk);
	pblk_core_free(pblk);

	kfree(pblk);
}
static void pblk_tear_down(struct pblk *pblk, bool graceful)
{
	if (graceful)
		__pblk_pipeline_flush(pblk);
	__pblk_pipeline_stop(pblk);
	pblk_writer_stop(pblk);
	pblk_rb_sync_l2p(&pblk->rwb);
	pblk_rl_free(&pblk->rl);

	pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful);
}
static void pblk_exit(void *private, bool graceful)
{
	struct pblk *pblk = private;

	pblk_gc_exit(pblk, graceful);
	pblk_tear_down(pblk, graceful);

#ifdef CONFIG_NVM_PBLK_DEBUG
	pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
#endif

	pblk_free(pblk);
}
static sector_t pblk_capacity(void *private)
{
	struct pblk *pblk = private;

	return pblk->capacity * NR_PHY_IN_LOG;
}
static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
		       int flags)
{
	struct nvm_geo *geo = &dev->geo;
	struct request_queue *bqueue = dev->q;
	struct request_queue *tqueue = tdisk->queue;
	struct pblk *pblk;
	int ret;

	pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
	if (!pblk)
		return ERR_PTR(-ENOMEM);

	pblk->dev = dev;
	pblk->disk = tdisk;
	pblk->state = PBLK_STATE_RUNNING;
	trace_pblk_state(pblk_disk_name(pblk), pblk->state);
	pblk->gc.gc_enabled = 0;

	if (!(geo->version == NVM_OCSSD_SPEC_12 ||
					geo->version == NVM_OCSSD_SPEC_20)) {
		pblk_err(pblk, "OCSSD version not supported (%u)\n",
							geo->version);
		kfree(pblk);
		return ERR_PTR(-EINVAL);
	}

	if (geo->ext) {
		pblk_err(pblk, "extended metadata not supported\n");
		kfree(pblk);
		return ERR_PTR(-EINVAL);
	}

	spin_lock_init(&pblk->resubmit_lock);
	spin_lock_init(&pblk->trans_lock);
	spin_lock_init(&pblk->lock);

#ifdef CONFIG_NVM_PBLK_DEBUG
	atomic_long_set(&pblk->inflight_writes, 0);
	atomic_long_set(&pblk->padded_writes, 0);
	atomic_long_set(&pblk->padded_wb, 0);
	atomic_long_set(&pblk->req_writes, 0);
	atomic_long_set(&pblk->sub_writes, 0);
	atomic_long_set(&pblk->sync_writes, 0);
	atomic_long_set(&pblk->inflight_reads, 0);
	atomic_long_set(&pblk->cache_reads, 0);
	atomic_long_set(&pblk->sync_reads, 0);
	atomic_long_set(&pblk->recov_writes, 0);
	atomic_long_set(&pblk->recov_writes, 0);
	atomic_long_set(&pblk->recov_gc_writes, 0);
	atomic_long_set(&pblk->recov_gc_reads, 0);
#endif

	atomic_long_set(&pblk->read_failed, 0);
	atomic_long_set(&pblk->read_empty, 0);
	atomic_long_set(&pblk->read_high_ecc, 0);
	atomic_long_set(&pblk->read_failed_gc, 0);
	atomic_long_set(&pblk->write_failed, 0);
	atomic_long_set(&pblk->erase_failed, 0);

	ret = pblk_core_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize core\n");
		goto fail;
	}

	ret = pblk_lines_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize lines\n");
		goto fail_free_core;
	}

	ret = pblk_rwb_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize write buffer\n");
		goto fail_free_lines;
	}

	ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY);
	if (ret) {
		pblk_err(pblk, "could not initialize maps\n");
		goto fail_free_rwb;
	}

	ret = pblk_writer_init(pblk);
	if (ret) {
		if (ret != -EINTR)
			pblk_err(pblk, "could not initialize write thread\n");
		goto fail_free_l2p;
	}

	ret = pblk_gc_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize gc\n");
		goto fail_stop_writer;
	}

	/* inherit the size from the underlying device */
	blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
	blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));

	blk_queue_write_cache(tqueue, true, false);

	tqueue->limits.discard_granularity = geo->clba * geo->csecs;
	tqueue->limits.discard_alignment = 0;
	blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue);

	pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
			geo->all_luns, pblk->l_mg.nr_lines,
			(unsigned long long)pblk->capacity,
			pblk->rwb.nr_entries);

	wake_up_process(pblk->writer_ts);

	/* Check if we need to start GC */
	pblk_gc_should_kick(pblk);

	return pblk;

fail_stop_writer:
	pblk_writer_stop(pblk);
fail_free_l2p:
	pblk_l2p_free(pblk);
fail_free_rwb:
	pblk_rwb_free(pblk);
fail_free_lines:
	pblk_lines_free(pblk);
fail_free_core:
	pblk_core_free(pblk);
fail:
	kfree(pblk);
	return ERR_PTR(ret);
}
/* physical block device target */
static struct nvm_tgt_type tt_pblk = {
	.name		= "pblk",
	.version	= {1, 0, 0},

	.make_rq	= pblk_make_rq,
	.capacity	= pblk_capacity,

	.init		= pblk_init,
	.exit		= pblk_exit,

	.sysfs_init	= pblk_sysfs_init,
	.sysfs_exit	= pblk_sysfs_exit,
	.owner		= THIS_MODULE,
};
static int __init pblk_module_init(void)
{
	int ret;

	ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
	if (ret)
		return ret;
	ret = nvm_register_tgt_type(&tt_pblk);
	if (ret)
		bioset_exit(&pblk_bio_set);
	return ret;
}
static void pblk_module_exit(void)
{
	bioset_exit(&pblk_bio_set);
	nvm_unregister_tgt_type(&tt_pblk);
}
module_init(pblk_module_init);
module_exit(pblk_module_exit);
MODULE_AUTHOR("Javier Gonzalez <javier@cnexlabs.com>");
MODULE_AUTHOR("Matias Bjorling <matias@cnexlabs.com>");
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs");