2 * The Guest block driver
4 * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
5 * The mechanism is simple: we place the information about the request in the
6 * device page, then use SEND_DMA (containing the data for a write, or an empty
7 * "ping" DMA for a read).
9 /* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26 #include <linux/init.h>
27 #include <linux/types.h>
28 #include <linux/blkdev.h>
29 #include <linux/interrupt.h>
30 #include <linux/lguest_bus.h>
32 static char next_block_index
= 'a';
34 /*D:420 Here is the structure which holds all the information we need about
35 * each Guest block device.
37 * I'm sure at this stage, you're wondering "hey, where was the adventure I was
38 * promised?" and thinking "Rusty sucks, I shall say nasty things about him on
39 * my blog". I think Real adventures have boring bits, too, and you're in the
40 * middle of one. But it gets better. Just not quite yet. */
43 /* The block queue infrastructure wants a spinlock: it is held while it
44 * calls our block request function. We grab it in our interrupt
45 * handler so the responses don't mess with new requests. */
48 /* The disk structure registered with kernel. */
51 /* The major device number for this disk, and the interrupt. We only
52 * really keep them here for completeness; we'd need them if we
53 * supported device unplugging. */
57 /* The physical address of this device's memory page */
58 unsigned long phys_addr
;
59 /* The mapped memory page for convenient access. */
60 struct lguest_block_page
*lb_page
;
62 /* We only have a single request outstanding at a time: this is it. */
63 struct lguest_dma dma
;
67 /*D:495 We originally used end_request() throughout the driver, but it turns
68 * out that end_request() is deprecated, and doesn't actually end the request
69 * (which seems like a good reason to deprecate it!). It simply ends the first
70 * bio. So if we had 3 bios in a "struct request" we would do all 3,
71 * end_request(), do 2, end_request(), do 1 and end_request(): twice as much
72 * work as we needed to do.
74 * This reinforced to me that I do not understand the block layer.
76 * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
77 * request. This improved disk speed by 130%. */
78 static void end_entire_request(struct request
*req
, int uptodate
)
80 if (end_that_request_first(req
, uptodate
, req
->hard_nr_sectors
))
82 add_disk_randomness(req
->rq_disk
);
83 blkdev_dequeue_request(req
);
84 end_that_request_last(req
, uptodate
);
87 /* I'm told there are only two stories in the world worth telling: love and
88 * hate. So there used to be a love scene here like this:
90 * Launcher: We could make beautiful I/O together, you and I.
91 * Guest: My, that's a big disk!
93 * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */
95 /*D:490 This is the interrupt handler, called when a block read or write has
96 * been completed for us. */
97 static irqreturn_t
lgb_irq(int irq
, void *_bd
)
99 /* We handed our "struct blockdev" as the argument to request_irq(), so
100 * it is passed through to us here. This tells us which device we're
101 * dealing with in case we have more than one. */
102 struct blockdev
*bd
= _bd
;
105 /* We weren't doing anything? Strange, but could happen if we shared
106 * interrupts (we don't!). */
108 pr_debug("No work!\n");
112 /* Not done yet? That's equally strange. */
113 if (!bd
->lb_page
->result
) {
114 pr_debug("No result!\n");
118 /* We have to grab the lock before ending the request. */
119 spin_lock_irqsave(&bd
->lock
, flags
);
120 /* "result" is 1 for success, 2 for failure: end_entire_request() wants
121 * to know whether this succeeded or not. */
122 end_entire_request(bd
->req
, bd
->lb_page
->result
== 1);
123 /* Clear out request, it's done. */
125 /* Reset incoming DMA for next time. */
126 bd
->dma
.used_len
= 0;
127 /* Ready for more reads or writes */
128 blk_start_queue(bd
->disk
->queue
);
129 spin_unlock_irqrestore(&bd
->lock
, flags
);
131 /* The interrupt was for us, we dealt with it. */
135 /*D:480 The block layer's "struct request" contains a number of "struct bio"s,
136 * each of which contains "struct bio_vec"s, each of which contains a page, an
137 * offset and a length.
139 * Fortunately there are iterators to help us walk through the "struct
140 * request". Even more fortunately, there were plenty of places to steal the
141 * code from. We pack the "struct request" into our "struct lguest_dma" and
142 * return the total length. */
143 static unsigned int req_to_dma(struct request
*req
, struct lguest_dma
*dma
)
145 unsigned int i
= 0, len
= 0;
146 struct req_iterator iter
;
147 struct bio_vec
*bvec
;
149 rq_for_each_segment(bvec
, req
, iter
) {
150 /* We told the block layer not to give us too many. */
151 BUG_ON(i
== LGUEST_MAX_DMA_SECTIONS
);
152 /* If we had a zero-length segment, it would look like
153 * the end of the data referred to by the "struct
154 * lguest_dma", so make sure that doesn't happen. */
155 BUG_ON(!bvec
->bv_len
);
156 /* Convert page & offset to a physical address */
157 dma
->addr
[i
] = page_to_phys(bvec
->bv_page
)
159 dma
->len
[i
] = bvec
->bv_len
;
163 /* If the array isn't full, we mark the end with a 0 length */
164 if (i
< LGUEST_MAX_DMA_SECTIONS
)
169 /* This creates an empty DMA, useful for prodding the Host without sending data
170 * (i.e. when we want to do a read) */
171 static void empty_dma(struct lguest_dma
*dma
)
176 /*D:470 Setting up a request is fairly easy: */
177 static void setup_req(struct blockdev
*bd
,
178 int type
, struct request
*req
, struct lguest_dma
*dma
)
180 /* The type is 1 (write) or 0 (read). */
181 bd
->lb_page
->type
= type
;
182 /* The sector on disk where the read or write starts. */
183 bd
->lb_page
->sector
= req
->sector
;
184 /* The result is initialized to 0 (unfinished). */
185 bd
->lb_page
->result
= 0;
186 /* The current request (so we can end it in the interrupt handler). */
188 /* The number of bytes: returned as a side-effect of req_to_dma(),
189 * which packs the block layer's "struct request" into our "struct
190 * lguest_dma". */
191 bd
->lb_page
->bytes
= req_to_dma(req
, dma
);
194 /*D:450 Write is pretty straightforward: we pack the request into a "struct
195 * lguest_dma", then use SEND_DMA to send the request. */
196 static void do_write(struct blockdev
*bd
, struct request
*req
)
198 struct lguest_dma send
;
200 pr_debug("lgb: WRITE sector %li\n", (long)req
->sector
);
201 setup_req(bd
, 1, req
, &send
);
203 lguest_send_dma(bd
->phys_addr
, &send
);
206 /* Read is similar to write, except we pack the request into our receive
207 * "struct lguest_dma" and send through an empty DMA just to tell the Host that
208 * there's a request pending. */
209 static void do_read(struct blockdev
*bd
, struct request
*req
)
211 struct lguest_dma ping
;
213 pr_debug("lgb: READ sector %li\n", (long)req
->sector
);
214 setup_req(bd
, 0, req
, &bd
->dma
);
217 lguest_send_dma(bd
->phys_addr
, &ping
);
220 /*D:440 This is where requests come in: we get handed the request queue and are
221 * expected to pull a "struct request" off it until we've finished them or
222 * we're waiting for a reply: */
223 static void do_lgb_request(struct request_queue
*q
)
229 /* This sometimes returns NULL even on the very first time around. I
230 * wonder if it's something to do with letting elves handle the request
232 req
= elv_next_request(q
);
236 /* We attached the struct blockdev to the disk: get it back */
237 bd
= req
->rq_disk
->private_data
;
238 /* Sometimes we get repeated requests after blk_stop_queue(), but we
239 * can only handle one at a time. */
243 /* We only do reads and writes: no tricky business! */
244 if (!blk_fs_request(req
)) {
245 pr_debug("Got non-command 0x%08x\n", req
->cmd_type
);
247 end_entire_request(req
, 0);
251 if (rq_data_dir(req
) == WRITE
)
256 /* We've put out the request, so stop any more coming in until we get
257 * an interrupt, which takes us to lgb_irq() to re-enable the queue. */
261 /*D:430 This is the "struct block_device_operations" we attach to the disk at
262 * the end of lguestblk_probe(). It doesn't seem to want much. */
263 static struct block_device_operations lguestblk_fops
= {
264 .owner
= THIS_MODULE
,
267 /*D:425 Setting up a disk device seems to involve a lot of code. I'm not sure
268 * quite why. I do know that the IDE code sent two or three of the maintainers
269 * insane, perhaps this is the fringe of the same disease?
271 * As in the console code, the probe function gets handed the generic
272 * lguest_device from lguest_bus.c: */
273 static int lguestblk_probe(struct lguest_device
*lgdev
)
277 int irqflags
= IRQF_SHARED
;
279 /* First we allocate our own "struct blockdev" and initialize the easy
281 bd
= kmalloc(sizeof(*bd
), GFP_KERNEL
);
285 spin_lock_init(&bd
->lock
);
286 bd
->irq
= lgdev_irq(lgdev
);
288 bd
->dma
.used_len
= 0;
290 /* The descriptor in the lguest_devices array provided by the Host
291 * gives the Guest the physical page number of the device's page. */
292 bd
->phys_addr
= (lguest_devices
[lgdev
->index
].pfn
<< PAGE_SHIFT
);
294 /* We use lguest_map() to get a pointer to the device page */
295 bd
->lb_page
= lguest_map(bd
->phys_addr
, 1);
301 /* We need a major device number: 0 means "assign one dynamically". */
302 bd
->major
= register_blkdev(0, "lguestblk");
308 /* This allocates a "struct gendisk" where we pack all the information
309 * about the disk which the rest of Linux sees. The argument is the
310 * number of minor devices desired: we need one minor for the main
311 * disk, and one for each partition. Of course, we can't possibly know
312 * how many partitions are on the disk (add_disk does that).
314 bd
->disk
= alloc_disk(16);
317 goto out_unregister_blkdev
;
320 /* Every disk needs a queue for requests to come in: we set up the
321 * queue with a callback function (the core of our driver) and the lock
323 bd
->disk
->queue
= blk_init_queue(do_lgb_request
, &bd
->lock
);
324 if (!bd
->disk
->queue
) {
329 /* We can only handle a certain number of pointers in our SEND_DMA
330 * call, so we set that with blk_queue_max_hw_segments(). This is not
331 * to be confused with blk_queue_max_phys_segments() of course! I
332 * know, who could possibly confuse the two?
334 * Well, it's simple to tell them apart: this one seems to work and the
335 * other one didn't. */
336 blk_queue_max_hw_segments(bd
->disk
->queue
, LGUEST_MAX_DMA_SECTIONS
);
338 /* Due to technical limitations of our Host (and simple coding) we
339 * can't have a single buffer which crosses a page boundary. Tell it
340 * here. This means that our maximum request size is 16
341 * (LGUEST_MAX_DMA_SECTIONS) pages. */
342 blk_queue_segment_boundary(bd
->disk
->queue
, PAGE_SIZE
-1);
344 /* We name our disk: this becomes the device name when udev does its
345 * magic thing and creates the device node, such as /dev/lgba.
346 * next_block_index is a global which starts at 'a'. Unfortunately
347 * this simple increment logic means that the 27th disk will be called
348 * "/dev/lgb{". In that case, I recommend having at least 29 disks, so
349 * your /dev directory will be balanced. */
350 sprintf(bd
->disk
->disk_name
, "lgb%c", next_block_index
++);
352 /* We look to the device descriptor again to see if this device's
353 * interrupts are expected to be random. If they are, we tell the irq
354 * subsystem. At the moment this bit is always set. */
355 if (lguest_devices
[lgdev
->index
].features
& LGUEST_DEVICE_F_RANDOMNESS
)
356 irqflags
|= IRQF_SAMPLE_RANDOM
;
358 /* Now we have the name and irqflags, we can request the interrupt; we
359 * give it the "struct blockdev" we have set up to pass to lgb_irq()
360 * when there is an interrupt. */
361 err
= request_irq(bd
->irq
, lgb_irq
, irqflags
, bd
->disk
->disk_name
, bd
);
363 goto out_cleanup_queue
;
365 /* We bind our one-entry DMA pool to the key for this block device so
366 * the Host can reply to our requests. The key is equal to the
367 * physical address of the device's page, which is conveniently
369 err
= lguest_bind_dma(bd
->phys_addr
, &bd
->dma
, 1, bd
->irq
);
373 /* We finish our disk initialization and add the disk to the system. */
374 bd
->disk
->major
= bd
->major
;
375 bd
->disk
->first_minor
= 0;
376 bd
->disk
->private_data
= bd
;
377 bd
->disk
->fops
= &lguestblk_fops
;
378 /* This is initialized to the disk size by the Launcher. */
379 set_capacity(bd
->disk
, bd
->lb_page
->num_sectors
);
382 printk(KERN_INFO
"%s: device %i at major %d\n",
383 bd
->disk
->disk_name
, lgdev
->index
, bd
->major
);
385 /* We don't need to keep the "struct blockdev" around, but if we ever
386 * implemented device removal, we'd need this. */
391 free_irq(bd
->irq
, bd
);
393 blk_cleanup_queue(bd
->disk
->queue
);
396 out_unregister_blkdev
:
397 unregister_blkdev(bd
->major
, "lguestblk");
399 lguest_unmap(bd
->lb_page
);
405 /*D:410 The boilerplate code for registering the lguest block driver is just
406 * like the console: */
407 static struct lguest_driver lguestblk_drv
= {
409 .owner
= THIS_MODULE
,
410 .device_type
= LGUEST_DEVICE_T_BLOCK
,
411 .probe
= lguestblk_probe
,
414 static __init
int lguestblk_init(void)
416 return register_lguest_driver(&lguestblk_drv
);
418 module_init(lguestblk_init
);
420 MODULE_DESCRIPTION("Lguest block driver");
421 MODULE_LICENSE("GPL");