/*
 *  Device operations for the pnfs nfs4 file layout driver.
 *
 *  The Regents of the University of Michigan
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *  Garth Goodson   <Garth.Goodson@netapp.com>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement. The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */

#include <linux/nfs_fs.h>
#include <linux/vmalloc.h>

#include "nfs4filelayout.h"

#define NFSDBG_FACILITY		NFSDBG_PNFS_LD

/*
 * Device ID RCU cache. A device ID is unique per client ID and layout type.
 */
#define NFS4_FL_DEVICE_ID_HASH_BITS	5
#define NFS4_FL_DEVICE_ID_HASH_SIZE	(1 << NFS4_FL_DEVICE_ID_HASH_BITS)
#define NFS4_FL_DEVICE_ID_HASH_MASK	(NFS4_FL_DEVICE_ID_HASH_SIZE - 1)

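/*
 * Hash a device ID down to a bucket index in the file layout device ID
 * cache: accumulate the NFS4_DEVICEID4_SIZE bytes of the ID and mask the
 * result with NFS4_FL_DEVICE_ID_HASH_MASK.
 */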
static inline u32
nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
{
	unsigned char *cptr = (unsigned char *)id->data;
	unsigned int nbytes = NFS4_DEVICEID4_SIZE;
	u32 x = 0;

	while (nbytes--) {
		x *= 37;
		x += *cptr++;
	}
	return x & NFS4_FL_DEVICE_ID_HASH_MASK;
}

static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
static DEFINE_SPINLOCK(filelayout_deviceid_lock);

/*
 * Data servers can be mapped to different device ids.
 * nfs4_pnfs_ds reference counting
 *   - set to 1 on allocation
 *   - incremented when a device id maps a data server already in the cache.
 *   - decremented when deviceid is removed from the cache.
 */
DEFINE_SPINLOCK(nfs4_ds_cache_lock);
static LIST_HEAD(nfs4_data_server_cache);

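/* Debug helpers: dump a data server entry, a device's data server list, and a device ID. */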
void
print_ds(struct nfs4_pnfs_ds *ds)
{
	if (ds == NULL) {
		printk("%s NULL device\n", __func__);
		return;
	}
	printk("        ip_addr %x port %hu\n"
		"        ref count %d\n"
		"        client %p\n"
		"        cl_exchange_flags %x\n",
		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
		atomic_read(&ds->ds_count), ds->ds_clp,
		ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
}

void
print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
{
	int i;

	printk("%s dsaddr->ds_num %d\n", __func__,
	       dsaddr->ds_num);
	for (i = 0; i < dsaddr->ds_num; i++)
		print_ds(dsaddr->ds_list[i]);
}

void print_deviceid(struct nfs4_deviceid *id)
{
	u32 *p = (u32 *)id;

	dprintk("%s: device id= [%x%x%x%x]\n", __func__,
		p[0], p[1], p[2], p[3]);
}

/* nfs4_ds_cache_lock is held */
static struct nfs4_pnfs_ds *
_data_server_lookup_locked(u32 ip_addr, u32 port)
{
	struct nfs4_pnfs_ds *ds;

	dprintk("_data_server_lookup: ip_addr=%x port=%hu\n",
		ntohl(ip_addr), ntohs(port));

	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
		if (ds->ds_ip_addr == ip_addr &&
		    ds->ds_port == port) {
			return ds;
		}
	}
	return NULL;
}

/*
 * Create an rpc connection to the nfs4_pnfs_ds data server
 * Currently only support IPv4
 */
static int
nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
{
	struct nfs_client *clp;
	struct sockaddr_in sin;
	int status = 0;

	dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
		mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);

	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = ds->ds_ip_addr;
	sin.sin_port = ds->ds_port;

	clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin,
				 sizeof(sin), IPPROTO_TCP);
	if (IS_ERR(clp)) {
		status = PTR_ERR(clp);
		goto out;
	}

	if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) {
		if (!is_ds_client(clp)) {
			status = -ENODEV;
			goto out_put;
		}
		ds->ds_clp = clp;
		dprintk("%s [existing] ip=%x, port=%hu\n", __func__,
			ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
		goto out;
	}

	/*
	 * Do not set NFS_CS_CHECK_LEASE_TIME instead set the DS lease to
	 * be equal to the MDS lease. Renewal is scheduled in create_session.
	 */
	spin_lock(&mds_srv->nfs_client->cl_lock);
	clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
	spin_unlock(&mds_srv->nfs_client->cl_lock);
	clp->cl_last_renewal = jiffies;

	status = nfs4_init_ds_session(clp);
	if (status)
		goto out_put;

	ds->ds_clp = clp;
	dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr),
		ntohs(ds->ds_port));
out:
	return status;
out_put:
	nfs_put_client(clp);
	goto out;
}

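/* Tear down a data server entry: drop its nfs_client, if any, and free it. */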
static void
destroy_ds(struct nfs4_pnfs_ds *ds)
{
	dprintk("--> %s\n", __func__);

	if (ds->ds_clp)
		nfs_put_client(ds->ds_clp);
	kfree(ds);
}

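/*
 * Free a device ID: drop the ds_count reference on each data server in
 * ds_list, destroying any server whose count reaches zero, then free the
 * stripe index table and the dsaddr itself.
 */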
void
nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
	struct nfs4_pnfs_ds *ds;
	int i;

	print_deviceid(&dsaddr->deviceid);

	for (i = 0; i < dsaddr->ds_num; i++) {
		ds = dsaddr->ds_list[i];
		if (ds != NULL) {
			if (atomic_dec_and_lock(&ds->ds_count,
						&nfs4_ds_cache_lock)) {
				list_del_init(&ds->ds_node);
				spin_unlock(&nfs4_ds_cache_lock);
				destroy_ds(ds);
			}
		}
	}
	kfree(dsaddr->stripe_indices);
	kfree(dsaddr);
}

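/*
 * Look up ip_addr:port in the data server cache. Reuse an existing entry
 * (taking an extra ds_count reference) or insert a freshly allocated one.
 */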
static struct nfs4_pnfs_ds *
nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
{
	struct nfs4_pnfs_ds *tmp_ds, *ds;

	ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL);
	if (!ds)
		goto out;

	spin_lock(&nfs4_ds_cache_lock);
	tmp_ds = _data_server_lookup_locked(ip_addr, port);
	if (tmp_ds == NULL) {
		ds->ds_ip_addr = ip_addr;
		ds->ds_port = port;
		atomic_set(&ds->ds_count, 1);
		INIT_LIST_HEAD(&ds->ds_node);
		list_add(&ds->ds_node, &nfs4_data_server_cache);
		dprintk("%s add new data server ip 0x%x\n", __func__,
			ds->ds_ip_addr);
	} else {
		kfree(ds);
		atomic_inc(&tmp_ds->ds_count);
		dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n",
			__func__, tmp_ds->ds_ip_addr,
			atomic_read(&tmp_ds->ds_count));
		ds = tmp_ds;
	}
	spin_unlock(&nfs4_ds_cache_lock);
out:
	return ds;
}

/*
 * Currently only support ipv4, and one multi-path address.
 */
static struct nfs4_pnfs_ds *
decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode)
{
	struct nfs4_pnfs_ds *ds = NULL;
	char *buf;
	const char *ipend, *pstr;
	u32 ip_addr, port;
	int nlen, rlen, i;
	int tmp[2];
	__be32 *p;

	p = xdr_inline_decode(streamp, 4);
	if (unlikely(!p))
		goto out_err;
	nlen = be32_to_cpup(p++);

	p = xdr_inline_decode(streamp, nlen);
	if (unlikely(!p))
		goto out_err;

	/* Check that netid is "tcp" */
	if (nlen != 3 || memcmp((char *)p, "tcp", 3)) {
		dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
		goto out_err;
	}

	p = xdr_inline_decode(streamp, 4);
	if (unlikely(!p))
		goto out_err;
	rlen = be32_to_cpup(p);

	p = xdr_inline_decode(streamp, rlen);
	if (unlikely(!p))
		goto out_err;

	/* ipv6 length plus port is legal */
	if (rlen > INET6_ADDRSTRLEN + 8) {
		dprintk("%s: Invalid address, length %d\n", __func__,
			rlen);
		goto out_err;
	}
	buf = kmalloc(rlen + 1, GFP_KERNEL);
	if (!buf) {
		dprintk("%s: Not enough memory\n", __func__);
		goto out_err;
	}
	buf[rlen] = '\0';
	memcpy(buf, p, rlen);

	/* replace the port dots with dashes for the in4_pton() delimiter */
	for (i = 0; i < 2; i++) {
		char *res = strrchr(buf, '.');
		if (!res) {
			dprintk("%s: Failed finding expected dots in port\n",
				__func__);
			goto out_free;
		}
		*res = '-';
	}

	/* Currently only support ipv4 address */
	if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) {
		dprintk("%s: Only ipv4 addresses supported\n", __func__);
		goto out_free;
	}

	pstr = ipend;
	sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]);
	port = htons((tmp[0] << 8) | (tmp[1]));

	ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
	dprintk("%s: Decoded address and port %s\n", __func__, buf);
out_free:
	kfree(buf);
out_err:
	return ds;
}

/* Decode opaque device data and return the result */
static struct nfs4_file_layout_dsaddr *
decode_device(struct inode *ino, struct pnfs_device *pdev)
{
	int i;
	u32 cnt, num;
	u8 *indexp;
	__be32 *p;
	u8 *stripe_indices;
	u8 max_stripe_index;
	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
	struct xdr_stream stream;
	struct xdr_buf buf = {
		.pages = pdev->pages,
		.page_len = pdev->pglen,
		.buflen = pdev->pglen,
		.len = pdev->pglen,
	};
	struct page *scratch;

	/* set up xdr stream */
	scratch = alloc_page(GFP_KERNEL);
	if (!scratch)
		goto out_err;

	xdr_init_decode(&stream, &buf, NULL);
	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);

	/* Get the stripe count (number of stripe index) */
	p = xdr_inline_decode(&stream, 4);
	if (unlikely(!p))
		goto out_err_free_scratch;

	cnt = be32_to_cpup(p);
	dprintk("%s stripe count %d\n", __func__, cnt);
	if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
		printk(KERN_WARNING "%s: stripe count %d greater than "
		       "supported maximum %d\n", __func__,
		       cnt, NFS4_PNFS_MAX_STRIPE_CNT);
		goto out_err_free_scratch;
	}

	/* read stripe indices */
	stripe_indices = kcalloc(cnt, sizeof(u8), GFP_KERNEL);
	if (!stripe_indices)
		goto out_err_free_scratch;

	p = xdr_inline_decode(&stream, cnt << 2);
	if (unlikely(!p))
		goto out_err_free_stripe_indices;

	indexp = &stripe_indices[0];
	max_stripe_index = 0;
	for (i = 0; i < cnt; i++) {
		*indexp = be32_to_cpup(p++);
		max_stripe_index = max(max_stripe_index, *indexp);
		indexp++;
	}

	/* Check the multipath list count */
	p = xdr_inline_decode(&stream, 4);
	if (unlikely(!p))
		goto out_err_free_stripe_indices;

	num = be32_to_cpup(p);
	dprintk("%s ds_num %u\n", __func__, num);
	if (num > NFS4_PNFS_MAX_MULTI_CNT) {
		printk(KERN_WARNING "%s: multipath count %d greater than "
		       "supported maximum %d\n", __func__,
		       num, NFS4_PNFS_MAX_MULTI_CNT);
		goto out_err_free_stripe_indices;
	}

	/* validate stripe indices are all < num */
	if (max_stripe_index >= num) {
		printk(KERN_WARNING "%s: stripe index %u >= num ds %u\n",
		       __func__, max_stripe_index, num);
		goto out_err_free_stripe_indices;
	}

	dsaddr = kzalloc(sizeof(*dsaddr) +
			(sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
			GFP_KERNEL);
	if (!dsaddr)
		goto out_err_free_stripe_indices;

	dsaddr->stripe_count = cnt;
	dsaddr->stripe_indices = stripe_indices;
	stripe_indices = NULL;
	dsaddr->ds_num = num;

	memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));

	for (i = 0; i < dsaddr->ds_num; i++) {
		int j;
		u32 mp_count;

		p = xdr_inline_decode(&stream, 4);
		if (unlikely(!p))
			goto out_err_free_deviceid;

		mp_count = be32_to_cpup(p); /* multipath count */
		if (mp_count > 1) {
			printk(KERN_WARNING
			       "%s: Multipath count %d not supported, "
			       "skipping all greater than 1\n", __func__,
			       mp_count);
		}
		for (j = 0; j < mp_count; j++) {
			if (j == 0) {
				dsaddr->ds_list[i] = decode_and_add_ds(&stream,
					ino);
				if (dsaddr->ds_list[i] == NULL)
					goto out_err_free_deviceid;
			} else {
				u32 len;
				/* skip extra multipath */

				p = xdr_inline_decode(&stream, 4);
				if (unlikely(!p))
					goto out_err_free_deviceid;
				len = be32_to_cpup(p);

				p = xdr_inline_decode(&stream, len);
				if (unlikely(!p))
					goto out_err_free_deviceid;

				p = xdr_inline_decode(&stream, 4);
				if (unlikely(!p))
					goto out_err_free_deviceid;
				len = be32_to_cpup(p);

				p = xdr_inline_decode(&stream, len);
				if (unlikely(!p))
					goto out_err_free_deviceid;
			}
		}
	}

	__free_page(scratch);
	return dsaddr;

out_err_free_deviceid:
	nfs4_fl_free_deviceid(dsaddr);
	/* stripe_indices was part of dsaddr */
	goto out_err_free_scratch;
out_err_free_stripe_indices:
	kfree(stripe_indices);
out_err_free_scratch:
	__free_page(scratch);
out_err:
	dprintk("%s ERROR: returning NULL\n", __func__);
	return NULL;
}

/*
 * Decode the opaque device specified in 'dev' and add it to the cache of
 * available devices.
 */
static struct nfs4_file_layout_dsaddr *
decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
{
	struct nfs4_file_layout_dsaddr *d, *new;
	long hash;

	new = decode_device(inode, dev);
	if (!new) {
		printk(KERN_WARNING "%s: Could not decode or add device\n",
			__func__);
		return NULL;
	}

	spin_lock(&filelayout_deviceid_lock);
	d = nfs4_fl_find_get_deviceid(&new->deviceid);
	if (d) {
		spin_unlock(&filelayout_deviceid_lock);
		nfs4_fl_free_deviceid(new);
		return d;
	}

	INIT_HLIST_NODE(&new->node);
	atomic_set(&new->ref, 1);
	hash = nfs4_fl_deviceid_hash(&new->deviceid);
	hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
	spin_unlock(&filelayout_deviceid_lock);

	return new;
}

/*
 * Retrieve the information for dev_id, add it to the list
 * of available devices, and return it.
 */
struct nfs4_file_layout_dsaddr *
get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
{
	struct pnfs_device *pdev = NULL;
	u32 max_resp_sz;
	int max_pages;
	struct page **pages = NULL;
	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
	int rc, i;
	struct nfs_server *server = NFS_SERVER(inode);

	/*
	 * Use the session max response size as the basis for setting
	 * GETDEVICEINFO's maxcount
	 */
	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
	max_pages = max_resp_sz >> PAGE_SHIFT;
	dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
		__func__, inode, max_resp_sz, max_pages);

	pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL);
	if (pdev == NULL)
		return NULL;

	pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
	if (pages == NULL) {
		kfree(pdev);
		return NULL;
	}
	for (i = 0; i < max_pages; i++) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i])
			goto out_free;
	}

	memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
	pdev->layout_type = LAYOUT_NFSV4_1_FILES;
	pdev->pages = pages;
	pdev->pglen = PAGE_SIZE * max_pages;

	rc = nfs4_proc_getdeviceinfo(server, pdev);
	dprintk("%s getdevice info returns %d\n", __func__, rc);
	if (rc)
		goto out_free;

	/*
	 * Found new device, need to decode it and then add it to the
	 * list of known devices for this mountpoint.
	 */
	dsaddr = decode_and_add_device(inode, pdev);
out_free:
	for (i = 0; i < max_pages; i++)
		if (pages[i])
			__free_page(pages[i]);
	kfree(pages);
	kfree(pdev);
	dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
	return dsaddr;
}

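/*
 * Release a reference on a device ID. On the final put, unhash it from the
 * RCU cache and free it.
 */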
void
nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
	if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
		hlist_del_rcu(&dsaddr->node);
		spin_unlock(&filelayout_deviceid_lock);
		synchronize_rcu();
		nfs4_fl_free_deviceid(dsaddr);
	}
}

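/*
 * Look up a device ID in the RCU cache and take a reference on the match,
 * or return NULL if it is not cached.
 */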
struct nfs4_file_layout_dsaddr *
nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
{
	struct nfs4_file_layout_dsaddr *d;
	struct hlist_node *n;
	long hash = nfs4_fl_deviceid_hash(id);

	rcu_read_lock();
	hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
		if (!memcmp(&d->deviceid, id, sizeof(*id))) {
			if (!atomic_inc_not_zero(&d->ref))
				goto fail;
			rcu_read_unlock();
			return d;
		}
	}
fail:
	rcu_read_unlock();
	return NULL;
}

/*
 * Want res = (offset - layout->pattern_offset) / layout->stripe_unit
 * Then: ((res + fsi) % dsaddr->stripe_count)
 */
u32
nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
{
	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
	u64 tmp;

	tmp = offset - flseg->pattern_offset;
	do_div(tmp, flseg->stripe_unit);
	tmp += flseg->first_stripe_index;
	return do_div(tmp, flseg->dsaddr->stripe_count);
}

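/* Map stripe index j to a data server index via the stripe_indices table. */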
u32
nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
{
	return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
}

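/*
 * Select the file handle to use for stripe j. Sparse layouts may share a
 * single fh, or fall back to the MDS open fh when none is provided.
 */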
struct nfs_fh *
nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
{
	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
	u32 i;

	if (flseg->stripe_type == STRIPE_SPARSE) {
		if (flseg->num_fh == 1)
			i = 0;
		else if (flseg->num_fh == 0)
			/* Use the MDS OPEN fh set in nfs_read_rpcsetup */
			return NULL;
		else
			i = nfs4_fl_calc_ds_index(lseg, j);
	} else
		i = j;
	return flseg->fh_array[i];
}

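/* Mark a device ID out of use after a data server connection error. */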
static void
filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
			       int err, u32 ds_addr)
{
	u32 *p = (u32 *)&dsaddr->deviceid;

	printk(KERN_ERR "NFS: data server %x connection error %d."
		" Deviceid [%x%x%x%x] marked out of use.\n",
		ds_addr, err, p[0], p[1], p[2], p[3]);

	spin_lock(&filelayout_deviceid_lock);
	dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
	spin_unlock(&filelayout_deviceid_lock);
}

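/*
 * Return the data server for ds_idx, establishing its connection first if
 * needed. Returns NULL if the device ID is already marked out of use or the
 * connection attempt fails.
 */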
struct nfs4_pnfs_ds *
nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
{
	struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];

	if (ds == NULL) {
		printk(KERN_ERR "%s: No data server for offset index %d\n",
			__func__, ds_idx);
		return NULL;
	}

	if (!ds->ds_clp) {
		struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
		int err;

		if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) {
			/* Already tried to connect, don't try again */
			dprintk("%s Deviceid marked out of use\n", __func__);
			return NULL;
		}
		err = nfs4_ds_connect(s, ds);
		if (err) {
			filelayout_mark_devid_negative(dsaddr, err,
						       ntohl(ds->ds_ip_addr));
			return NULL;
		}
	}
	return ds;
}