docs: Tweak location of qemu nbd extensions
[nbd.git] / nbd-server.c
blob2818f2fe28167a6ee18b143420fe91abd6f345b5
1 /*
2 * Network Block Device - server
4 * Copyright 1996-1998 Pavel Machek, distribute under GPL
5 * <pavel@atrey.karlin.mff.cuni.cz>
6 * Copyright 2001-2004 Wouter Verhelst <wouter@debian.org>
7 * Copyright 2002 Anton Altaparmakov <aia21@cam.ac.uk>
9 * Version 1.0 - hopefully 64-bit-clean
10 * Version 1.1 - merging enhancements from Josh Parsons, <josh@coombs.anu.edu.au>
11 * Version 1.2 - autodetect size of block devices, thanx to Peter T. Breuer" <ptb@it.uc3m.es>
12 * Version 1.5 - can compile on Unix systems that don't have 64 bit integer
13 * type, or don't have 64 bit file offsets by defining FS_32BIT
14 * in compile options for nbd-server *only*. This can be done
15 * with make FSCHOICE=-DFS_32BIT nbd-server. (I don't have the
16 * original autoconf input file, or I would make it a configure
17 * option.) Ken Yap <ken@nlc.net.au>.
18 * Version 1.6 - fix autodetection of block device size and really make 64 bit
19 * clean on 32 bit machines. Anton Altaparmakov <aia21@cam.ac.uk>
20 * Version 2.0 - Version synchronised with client
21 * Version 2.1 - Reap zombie client processes when they exit. Removed
22 * (uncommented) the _IO magic, it's no longer necessary. Wouter
23 * Verhelst <wouter@debian.org>
24 * Version 2.2 - Auto switch to read-only mode (usefull for floppies).
25 * Version 2.3 - Fixed code so that Large File Support works. This
26 * removes the FS_32BIT compile-time directive; define
27 * _FILE_OFFSET_BITS=64 and _LARGEFILE_SOURCE if you used to be
28 * using FS_32BIT. This will allow you to use files >2GB instead of
29 * having to use the -m option. Wouter Verhelst <wouter@debian.org>
30 * Version 2.4 - Added code to keep track of children, so that we can
31 * properly kill them from initscripts. Add a call to daemon(),
32 * so that processes don't think they have to wait for us, which is
33 * interesting for initscripts as well. Wouter Verhelst
34 * <wouter@debian.org>
35 * Version 2.5 - Bugfix release: forgot to reset child_arraysize to
36 * zero after fork()ing, resulting in nbd-server going berserk
37 * when it receives a signal with at least one child open. Wouter
38 * Verhelst <wouter@debian.org>
39 * 10/10/2003 - Added socket option SO_KEEPALIVE (sf.net bug 819235);
40 * rectified type of mainloop::size_host (sf.net bugs 814435 and
41 * 817385); close the PID file after writing to it, so that the
42 * daemon can actually be found. Wouter Verhelst
43 * <wouter@debian.org>
44 * 10/10/2003 - Size of the data "size_host" was wrong and so was not
45 * correctly put in network endianness. Many types were corrected
46 * (size_t and off_t instead of int). <vspaceg@sourceforge.net>
47 * Version 2.6 - Some code cleanup.
48 * Version 2.7 - Better build system.
49 * 11/02/2004 - Doxygenified the source, modularized it a bit. Needs a
50 * lot more work, but this is a start. Wouter Verhelst
51 * <wouter@debian.org>
52 * 16/03/2010 - Add IPv6 support.
53 * Kitt Tientanopajai <kitt@kitty.in.th>
54 * Neutron Soutmun <neo.neutron@gmail.com>
55 * Suriya Soutmun <darksolar@gmail.com>
58 /* Includes LFS defines, which defines behaviours of some of the following
59 * headers, so must come before those */
60 #include "lfs.h"
61 #define _DEFAULT_SOURCE
62 #define _XOPEN_SOURCE 500 /* to get pread/pwrite */
63 #if NEED_BSD_SOURCE
64 #define _BSD_SOURCE /* to get DT_* macros on some platforms */
65 #endif
66 #define _DARWIN_C_SOURCE /* to get DT_* macros on OS X */
68 #include <assert.h>
69 #include <sys/types.h>
70 #include <sys/socket.h>
71 #include <sys/stat.h>
72 #include <sys/select.h>
73 #include <sys/wait.h>
74 #include <sys/un.h>
75 #ifdef HAVE_SYS_IOCTL_H
76 #include <sys/ioctl.h>
77 #endif
78 #ifdef HAVE_SYS_UIO_H
79 #include <sys/uio.h>
80 #endif
81 #include <sys/param.h>
82 #include <signal.h>
83 #include <errno.h>
84 #include <libgen.h>
85 #include <netinet/tcp.h>
86 #include <netinet/in.h>
87 #include <netdb.h>
88 #include <syslog.h>
89 #include <unistd.h>
90 #include <stdbool.h>
91 #include <stdio.h>
92 #include <stdlib.h>
93 #include <string.h>
94 #include <fcntl.h>
95 #if HAVE_FALLOC_PH
96 #include <linux/falloc.h>
97 #endif
98 #if HAVE_BLKDISCARD
99 #include <linux/fs.h>
100 #endif
101 #include <arpa/inet.h>
102 #include <strings.h>
103 #include <dirent.h>
104 #ifdef HAVE_SYS_DIR_H
105 #include <sys/dir.h>
106 #endif
107 #ifdef HAVE_SYS_DIRENT_H
108 #include <sys/dirent.h>
109 #endif
110 #include <getopt.h>
111 #include <pwd.h>
112 #include <grp.h>
113 #include <dirent.h>
114 #include <ctype.h>
115 #include <inttypes.h>
117 #include <glib.h>
119 /* used in cliserv.h, so must come first */
120 #define MY_NAME "nbd_server"
121 #include "cliserv.h"
122 #include "nbd-debug.h"
123 #include "netdb-compat.h"
124 #include "backend.h"
125 #include "treefiles.h"
126 #include "nbd-helper.h"
128 #ifdef WITH_SDP
129 #include <sdp_inet.h>
130 #endif
132 #if HAVE_FSCTL_SET_ZERO_DATA
133 #include <io.h>
134 /* don't include <windows.h> to avoid redefining eg the ERROR macro */
135 #define NOMINMAX 1
136 #include <windef.h>
137 #include <winbase.h>
138 #include <winioctl.h>
139 #endif
141 /** Default position of the config file */
142 #ifndef SYSCONFDIR
143 #define SYSCONFDIR "/etc"
144 #endif
145 #define CFILE SYSCONFDIR "/nbd-server/config"
147 #if HAVE_GNUTLS
148 #include <gnutls/gnutls.h>
149 #include <gnutls/x509.h>
150 #endif
152 #ifndef HAVE_G_MEMDUP2
153 /* Our uses of g_memdup2 below are safe from g_memdup's 32-bit overflow */
154 #define g_memdup2 g_memdup
155 #endif
158 * Shorten error handling and regular function return sequences
159 * automatically freeing dynamically allocated resources
161 #define _cleanup_(x) __attribute__((__cleanup__(x)))
162 static inline void g_freep(void *p) {
163 g_free(*(void**) p);
165 #define _cleanup_g_free_ _cleanup_(g_freep)
166 #define DEFINE_TRIVIAL_CLEANUP_FUNC(type, func) \
167 static inline void func##p(type *p) { \
168 if (*p) \
169 func(*p); \
171 DEFINE_TRIVIAL_CLEANUP_FUNC(GKeyFile*, g_key_file_free)
172 DEFINE_TRIVIAL_CLEANUP_FUNC(gchar **, g_strfreev)
/** Where our config file actually is; replaced by the -C command line option */
gchar* config_file_pos;

/** global flags (NOTE(review): presumably a mask of the global F_* bits; confirm against users) */
int glob_flags=0;

/* Whether we should avoid daemonizing the main process (set by -n and by -d) */
int nodaemon = 0;

/* Whether we should avoid forking into child processes (set by -d) */
int dontfork = 0;
/**
 * The highest value a variable of type off_t can reach. This is a signed
 * integer, so set all bits except for the leftmost one.
 *
 * The value is computed in an unsigned type and then converted: shifting a
 * 1 into the sign bit of a signed type is undefined behaviour in C. The
 * expansion is fully parenthesized so it can be used in any expression.
 **/
#define OFFT_MAX ((off_t)((((uintmax_t)1) << (sizeof(off_t) * 8 - 1)) - 1))
#define BUFSIZE ((1024*1024)+sizeof(struct nbd_reply)) /**< Size of buffer that can hold requests */
#define DIFFPAGESIZE 4096 /**< diff file uses those chunks */
/** Global flags (bit mask values stored in generic_conf.flags by the config parser): */
#define F_OLDSTYLE 1	  /**< Allow oldstyle (port-based) exports */
#define F_LIST 2	  /**< Allow clients to list the exports on a server */
#define F_NO_ZEROES 4	  /**< Do not send zeros to client */
#define F_DUAL_LISTEN 8	  /**< Listen on both TCP and unix socket */
// also accepts F_FORCEDTLS (which is 16384)
GHashTable *children;	  /**< NOTE(review): presumably tracks child processes; users are outside this section */
char pidfname[256]; /**< name of our PID file */
char default_authname[] = SYSCONFDIR "/nbd-server/allow"; /**< default name of allow file */
/* NOTE(review): these look like negotiation-stage bits; their users are
 * outside this section of the file, so confirm before relying on that. */
#define NEG_INIT	(1 << 0)
#define NEG_OLD		(1 << 1)
#define NEG_MODERN	(1 << 2)

/*
 * If we want what the system really has set we'd have to read
 * /proc/sys/fs/pipe-max-size, but for now 1mb should be enough.
 */
#define MAX_PIPE_SIZE (1 * 1024 * 1024)
/* NOTE(review): presumably indexes into work_package.pipefd[]; confirm against users */
#define SPLICE_IN	0
#define SPLICE_OUT	1
216 #include <nbdsrv.h>
/* Our thread pool; released by finalize_client() through g_thread_pool_free() */
GThreadPool *tpool = NULL;
221 /* A work package for the thread pool functions */
222 struct work_package {
223 CLIENT* client;
224 struct nbd_request* req;
225 int pipefd[2];
226 void* data; /**< for write requests */
static volatile sig_atomic_t is_sigchld_caught; /**< Flag set by
						     SIGCHLD handler
						     to mark a child
						     exit */

static volatile sig_atomic_t is_sigterm_caught; /**< Flag set by
						     SIGTERM handler
						     to mark an exit
						     request */

static volatile sig_atomic_t is_sighup_caught; /**< Flag set by SIGHUP
						    handler to mark a
						    reconfiguration
						    request */

GArray* modernsocks;	  /**< Sockets for the modern handler. Not used
			       if a client was only specified on the
			       command line; only port used if
			       oldstyle is set to false (and then the
			       command-line client isn't used, gna gna).
			       This may be more than one socket on
			       systems that don't support serving IPv4
			       and IPv6 from the same socket (like,
			       e.g., FreeBSD) */
GArray* childsocks;	/**< parent-side sockets for communication with children */
int commsocket;		/**< child-side socket for communication with parent */
static sem_t file_wait_sem; /**< NOTE(review): presumably backs the "waitfile" export option; users are outside this section */

bool logged_oversized=false;  /**< whether we logged oversized requests already */
/**
 * Type of configuration file values. Selects which GKeyFile getter the
 * config parser uses and how the parameter's target is written.
 **/
typedef enum {
	PARAM_INT,		/**< This parameter is an integer (stored as gint) */
	PARAM_INT64,		/**< This parameter is a 64-bit integer (stored as gint64) */
	PARAM_STRING,		/**< This parameter is a string */
	PARAM_BOOL,		/**< This parameter is a boolean */
} PARAM_TYPE;
/**
 * Configuration file values: one entry describes a single key the config
 * parser looks up in a group, and where the parsed value is stored.
 **/
typedef struct {
	gchar *paramname;	/**< Name of the parameter, as it appears in
				     the config file */
	gboolean required;	/**< Whether this is a required (as opposed to
				     optional) parameter */
	PARAM_TYPE ptype;	/**< Type of the parameter. */
	gpointer target;	/**< Pointer to where the data of this
				     parameter should be written. If ptype is
				     PARAM_BOOL, the flagval bits are set in
				     (true) or cleared from (false) the target
				     rather than the target being overwritten. */
	gint flagval;		/**< Flag mask for this parameter in case ptype
				     is PARAM_BOOL. */
} PARAM;
287 * Configuration file values of the "generic" section
289 struct generic_conf {
290 gchar *user; /**< user we run the server as */
291 gchar *group; /**< group we run running as */
292 gchar *modernaddr; /**< address of the modern socket */
293 gchar *modernport; /**< port of the modern socket */
294 gchar *unixsock; /**< file name of the unix domain socket */
295 gchar *certfile; /**< certificate file */
296 gchar *keyfile; /**< key file */
297 gchar *cacertfile; /**< CA certificate file */
298 gchar *tlsprio; /**< TLS priority string */
299 gint flags; /**< global flags */
300 gint threads; /**< maximum number of parallel threads we want to run */
303 #if HAVE_GNUTLS
304 static int writeit_tls(gnutls_session_t s, const void *buf, size_t len) {
305 _cleanup_g_free_ char *m = NULL;
306 ssize_t res;
307 while(len > 0) {
308 DEBUG("+");
309 if ((res = gnutls_record_send(s, buf, len)) < 0 && !gnutls_error_is_fatal(res)) {
310 m = g_strdup_printf("issue while sending data: %s", gnutls_strerror(res));
311 err_nonfatal(m);
312 } else if(res < 0) {
313 m = g_strdup_printf("could not send data: %s", gnutls_strerror(res));
314 err_nonfatal(m);
315 return -1;
316 } else {
317 len -= res;
318 buf += res;
321 return 0;
324 static int readit_tls(gnutls_session_t s, void *buf, size_t len) {
325 _cleanup_g_free_ char *m = NULL;
326 ssize_t res;
327 while(len > 0) {
328 DEBUG("*");
329 if((res = gnutls_record_recv(s, buf, len)) < 0 && !gnutls_error_is_fatal(res)) {
330 m = g_strdup_printf("issue while receiving data: %s", gnutls_strerror(res));
331 err_nonfatal(m);
332 } else if(res < 0) {
333 m = g_strdup_printf("could not receive data: %s", gnutls_strerror(res));
334 err_nonfatal(m);
335 return -1;
336 } else {
337 len -= res;
338 buf += res;
341 return 0;
344 static int socket_read_tls(CLIENT* client, void *buf, size_t len) {
345 return readit_tls(*((gnutls_session_t*)client->tls_session), buf, len);
348 static int socket_write_tls(CLIENT* client, const void *buf, size_t len) {
349 return writeit_tls(*((gnutls_session_t*)client->tls_session), buf, len);
351 #endif // HAVE_GNUTLS
353 static int socket_read_notls(CLIENT* client, void *buf, size_t len) {
354 return readit(client->net, buf, len);
357 static int socket_write_notls(CLIENT* client, const void *buf, size_t len) {
358 return writeit(client->net, buf, len);
361 static void socket_read(CLIENT* client, void *buf, size_t len) {
362 g_assert(client->socket_read != NULL);
363 if(client->socket_read(client, buf, len)<0) {
364 g_assert(client->socket_closed != NULL);
365 client->socket_closed(client);
370 * Consume data from a socket that we don't want
372 * @param c the client to read from
373 * @param len the number of bytes to consume
374 * @param buf a buffer
375 * @param bufsiz the size of the buffer
377 static inline void consume(CLIENT* c, size_t len, void * buf, size_t bufsiz) {
378 size_t curlen;
379 while (len>0) {
380 curlen = (len>bufsiz)?bufsiz:len;
381 socket_read(c, buf, curlen);
382 len -= curlen;
387 * Consume a length field and corresponding payload that we don't want
389 * @param c the client to read from
391 static inline void consume_len(CLIENT* c) {
392 uint32_t len;
393 char buf[1024];
395 socket_read(c, &len, sizeof(len));
396 len = ntohl(len);
397 consume(c, len, buf, sizeof(buf));
400 static void socket_write(CLIENT* client, const void *buf, size_t len) {
401 g_assert(client->socket_write != NULL);
402 if(client->socket_write(client, buf, len)<0) {
403 g_assert(client->socket_closed != NULL);
404 client->socket_closed(client);
/**
 * Socket-closed handler reporting a failed negotiation through err(); the
 * message includes the current errno text (%m). The client argument is
 * not used.
 **/
static inline void socket_closed_negotiate(CLIENT* client) {
	err("Negotiation failed: %m");
}
412 static void cleanup_transactionlog(CLIENT *client) {
414 if (client->transactionlogfd != -1) {
415 close(client->transactionlogfd);
416 client->transactionlogfd = -1;
418 if (client->logsem != SEM_FAILED) {
419 sem_close(client->logsem);
420 client->logsem = SEM_FAILED;
421 sem_unlink(client->semname);
/** Block on the client's log semaphore via sem_wait(); the result is not checked */
static void lock_logsem(CLIENT *client) {
	sem_wait(client->logsem);
}
/** Release the client's log semaphore via sem_post(); the result is not checked */
static void unlock_logsem(CLIENT *client) {
	sem_post(client->logsem);
}
433 * Run a command. This is used for the ``prerun'' and ``postrun'' config file
434 * options
436 * @param command the command to be ran. Read from the config file
437 * @param file the file name we're about to export
439 int do_run(gchar* command, gchar* file) {
440 _cleanup_g_free_ gchar* cmd = NULL;
441 int retval=0;
443 if(command && *command) {
444 cmd = g_strdup_printf(command, file);
445 retval=system(cmd);
447 return retval;
450 static inline void finalize_client(CLIENT* client) {
451 g_thread_pool_free(tpool, FALSE, TRUE);
452 do_run(client->server->postrun, client->exportname);
453 if(client->transactionlogfd != -1)
454 cleanup_transactionlog(client);
456 if(client->server->flags & F_COPYONWRITE) {
457 unlink(client->difffilename);
459 serve_dec_ref(client->server);
/**
 * Socket-closed handler for the transmission phase: finalizes the client
 * and reports the dropped connection through err().
 *
 * errno is saved before and restored after finalize_client() so that the
 * %m in the message describes the error that was current on entry, not
 * whatever the cleanup left behind.
 **/
static inline void socket_closed_transmission(CLIENT* client) {
	int saved_errno = errno;
	finalize_client(client);
	errno = saved_errno;
	err("Connection dropped: %m");
}
469 #ifdef HAVE_SPLICE
471 * Splice data between a pipe and a file descriptor
473 * @param fd_in The fd to splice from.
474 * @param off_in The fd_in offset to splice from.
475 * @param fd_out The fd to splice to.
476 * @param off_out The fd_out offset to splice to.
477 * @param len The length to splice.
479 static inline void spliceit(int fd_in, loff_t *off_in, int fd_out,
480 loff_t *off_out, size_t len)
482 ssize_t ret;
483 while (len > 0) {
484 if ((ret = splice(fd_in, off_in, fd_out, off_out, len,
485 SPLICE_F_MOVE)) <= 0)
486 err("Splice failed: %m");
487 len -= ret;
490 #endif
/**
 * Print out a message about how to use nbd-server. Split out to a separate
 * function so that we can call it from multiple places
 *
 * Writes the version, the option summary, the compiled-in default
 * configuration file path (CFILE) and the bug report address to stdout.
 * It does not exit; callers decide what to do afterwards.
 */
void usage() {
	printf("This is nbd-server version " VERSION "\n");
	printf("Usage: [ip:|ip6@]port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-C configuration file] [-p PID file name] [-o section name] [-M max connections] [-V] [-n] [-d]\n"
	       "\t-r|--read-only\t\tread only\n"
	       "\t-m|--multi-file\t\tmultiple file\n"
	       "\t-c|--copy-on-write\tcopy on write\n"
	       "\t-C|--config-file\tspecify an alternate configuration file\n"
	       "\t-l|--authorize-file\tfile with list of hosts that are allowed to\n\t\t\t\tconnect.\n"
	       "\t-p|--pid-file\t\tspecify a filename to write our PID to\n"
	       "\t-o|--output-config\toutput a config file section for what you\n\t\t\t\tspecified on the command line, with the\n\t\t\t\tspecified section name\n"
	       "\t-M|--max-connection\tspecify the maximum number of opened connections\n"
	       "\t-V|--version\t\toutput the version and exit\n"
	       "\t-n|--nodaemon\t\tdo not daemonize main process\n"
	       "\t-d|--dont-fork\t\tdo not fork (implies --nodaemon)\n\n"
	       "\tif port is set to 0, stdin is used (for running from inetd).\n"
	       "\tif file_to_export contains '%%s', it is substituted with the IP\n"
	       "\t\taddress of the machine trying to connect\n"
	       "\tif ip is set, it contains the local IP address on which we're listening.\n\tif not, the server will listen on all local IP addresses\n");
	/* This is the compiled-in default path, not necessarily the file in use */
	printf("Using configuration file %s\n", CFILE);
	printf("For help, or when encountering bugs, please contact %s\n", PACKAGE_BUGREPORT);
}
518 /* Dumps a config file section of the given SERVER*, and exits. */
519 void dump_section(SERVER* serve, gchar* section_header) {
520 printf("[%s]\n", section_header);
521 printf("\texportname = %s\n", serve->exportname);
522 printf("\tlistenaddr = %s\n", serve->listenaddr);
523 if(serve->flags & F_READONLY) {
524 printf("\treadonly = true\n");
526 if(serve->flags & F_MULTIFILE) {
527 printf("\tmultifile = true\n");
529 if(serve->flags & F_TREEFILES) {
530 printf("\ttreefiles = true\n");
532 if(serve->flags & F_COPYONWRITE) {
533 printf("\tcopyonwrite = true\n");
535 if(serve->expected_size) {
536 printf("\tfilesize = %lld\n", (long long int)serve->expected_size);
538 if(serve->authname) {
539 printf("\tauthfile = %s\n", serve->authname);
541 exit(EXIT_SUCCESS);
545 * Parse the command line.
547 * @param argc the argc argument to main()
548 * @param argv the argv argument to main()
550 SERVER* cmdline(int argc, char *argv[], struct generic_conf *genconf) {
551 int i=0;
552 int nonspecial=0;
553 int c;
554 struct option long_options[] = {
555 {"read-only", no_argument, NULL, 'r'},
556 {"multi-file", no_argument, NULL, 'm'},
557 {"copy-on-write", no_argument, NULL, 'c'},
558 {"nodaemon", no_argument, NULL, 'n'},
559 {"dont-fork", no_argument, NULL, 'd'},
560 {"authorize-file", required_argument, NULL, 'l'},
561 {"config-file", required_argument, NULL, 'C'},
562 {"pid-file", required_argument, NULL, 'p'},
563 {"output-config", required_argument, NULL, 'o'},
564 {"max-connection", required_argument, NULL, 'M'},
565 {"version", no_argument, NULL, 'V'},
566 {0,0,0,0}
568 SERVER *serve;
569 off_t es;
570 size_t last;
571 char suffix;
572 bool do_output=false;
573 gchar* section_header="";
574 gchar** addr_port;
576 if(argc==1) {
577 return NULL;
579 serve=serve_inc_ref((SERVER*)g_new0(SERVER, 1));
580 serve->authname = g_strdup(default_authname);
581 serve->virtstyle=VIRT_IPLIT;
582 while((c=getopt_long(argc, argv, "-C:cwndl:mo:rp:M:V", long_options, &i))>=0) {
583 switch (c) {
584 case 1:
585 /* non-option argument */
586 switch(nonspecial++) {
587 case 0:
588 if(strchr(optarg, ':') == strrchr(optarg, ':')) {
589 addr_port=g_strsplit(optarg, ":", 2);
591 /* Check for "@" - maybe user using this separator
592 for IPv4 address */
593 if(!addr_port[1]) {
594 g_strfreev(addr_port);
595 addr_port=g_strsplit(optarg, "@", 2);
597 } else {
598 addr_port=g_strsplit(optarg, "@", 2);
601 if(addr_port[1]) {
602 genconf->modernport=g_strdup(addr_port[1]);
603 genconf->modernaddr=g_strdup(addr_port[0]);
604 } else {
605 g_free(genconf->modernaddr);
606 genconf->modernaddr=NULL;
607 genconf->modernport=g_strdup(addr_port[0]);
609 g_strfreev(addr_port);
610 break;
611 case 1:
612 serve->exportname = g_strdup(optarg);
613 if(serve->exportname[0] != '/') {
614 fprintf(stderr, "E: The to be exported file needs to be an absolute filename!\n");
615 exit(EXIT_FAILURE);
617 break;
618 case 2:
619 last=strlen(optarg)-1;
620 suffix=optarg[last];
621 if (suffix == 'k' || suffix == 'K' ||
622 suffix == 'm' || suffix == 'M')
623 optarg[last] = '\0';
624 es = (off_t)atoll(optarg);
625 switch (suffix) {
626 case 'm':
627 case 'M': es <<= 10;
628 case 'k':
629 case 'K': es <<= 10;
630 default : break;
632 serve->expected_size = es;
633 break;
635 break;
636 case 'r':
637 serve->flags |= F_READONLY;
638 break;
639 case 'm':
640 serve->flags |= F_MULTIFILE;
641 break;
642 case 'o':
643 do_output = true;
644 section_header = g_strdup(optarg);
645 break;
646 case 'p':
647 strncpy(pidfname, optarg, 256);
648 pidfname[255]='\0';
649 break;
650 case 'c':
651 serve->flags |=F_COPYONWRITE;
652 break;
653 case 'n':
654 nodaemon = 1;
655 break;
656 case 'd':
657 dontfork = 1;
658 nodaemon = 1;
659 break;
660 case 'C':
661 g_free(config_file_pos);
662 config_file_pos=g_strdup(optarg);
663 break;
664 case 'l':
665 g_free(serve->authname);
666 serve->authname=g_strdup(optarg);
667 break;
668 case 'M':
669 serve->max_connections = strtol(optarg, NULL, 0);
670 break;
671 case 'V':
672 printf("This is nbd-server version " VERSION "\n");
673 exit(EXIT_SUCCESS);
674 break;
675 default:
676 usage();
677 exit(EXIT_FAILURE);
678 break;
681 /* What's left: the port to export, the name of the to be exported
682 * file, and, optionally, the size of the file, in that order. */
683 if(nonspecial<2) {
684 serve=serve_dec_ref(serve);
685 } else {
686 serve->servename = "";
688 if(do_output) {
689 if(!serve) {
690 g_critical("Need a complete configuration on the command line to output a config file section!");
691 exit(EXIT_FAILURE);
693 dump_section(serve, section_header);
695 return serve;
/* forward declaration of parse_cfile, which do_cfile_dir() calls before
 * the function is defined */
GArray* parse_cfile(gchar* f, struct generic_conf *genconf, bool expect_generic, GError** e);
/*
 * File type of the current directory entry. NBD_D_TYPE expands to an
 * expression using a "struct dirent *de" that must be in scope at the
 * point of use. Without d_type support it is always 0 (== DT_UNKNOWN as
 * defined here), and the two DT_* constants the code switches on are
 * provided locally.
 */
#ifdef HAVE_STRUCT_DIRENT_D_TYPE
#define NBD_D_TYPE de->d_type
#else
#define NBD_D_TYPE 0
#define DT_UNKNOWN 0
#define DT_REG 1
#endif
710 * Parse config file snippets in a directory. Uses readdir() and friends
711 * to find files and open them, then passes them on to parse_cfile
712 * with have_global set false
714 GArray* do_cfile_dir(gchar* dir, struct generic_conf *const genconf, GError** e) {
715 DIR* dirh = opendir(dir);
716 struct dirent* de;
717 gchar* fname;
718 GArray* retval = NULL;
719 GArray* tmp;
720 struct stat stbuf;
722 if(!dirh) {
723 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_DIR_UNKNOWN, "Invalid directory specified: %s", strerror(errno));
724 return NULL;
726 errno=0;
727 while((de = readdir(dirh))) {
728 int saved_errno=errno;
729 fname = g_build_filename(dir, de->d_name, NULL);
730 switch(NBD_D_TYPE) {
731 case DT_UNKNOWN:
732 /* Filesystem doesn't return type of
733 * file through readdir, or struct dirent
734 * doesn't have d_type. Run stat() on the file
735 * instead */
736 if(stat(fname, &stbuf)) {
737 perror("stat");
738 goto err_out;
740 if (!S_ISREG(stbuf.st_mode)) {
741 goto next;
743 case DT_REG:
744 /* Skip unless the name ends with '.conf' */
745 if(strcmp((de->d_name + strlen(de->d_name) - 5), ".conf")) {
746 goto next;
748 tmp = parse_cfile(fname, genconf, false, e);
749 errno=saved_errno;
750 if(*e) {
751 goto err_out;
753 if(!retval)
754 retval = g_array_new(FALSE, TRUE, sizeof(SERVER*));
755 retval = g_array_append_vals(retval, tmp->data, tmp->len);
756 g_array_free(tmp, TRUE);
757 default:
758 break;
760 next:
761 g_free(fname);
763 if(errno) {
764 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_READDIR_ERR, "Error trying to read directory: %s", strerror(errno));
765 err_out:
766 if(retval)
767 g_array_free(retval, TRUE);
768 retval = NULL;
770 if(dirh)
771 closedir(dirh);
772 return retval;
/**
 * To be called by GArray clearing function: drops one reference on the
 * SERVER the array element points to.
 * @param server pointer to server element
 **/
static void serve_clear_element(SERVER **server) {
	serve_dec_ref(*server);
}
784 * Parse the config file.
786 * @param f the name of the config file
788 * @param genconf a pointer to generic configuration which will get
789 * updated with parsed values. If NULL, then parsed generic
790 * configuration values are safely and silently discarded.
792 * @param e a GError. Error code can be any of the following:
793 * NBDS_ERR_CFILE_NOTFOUND, NBDS_ERR_CFILE_MISSING_GENERIC,
794 * NBDS_ERR_CFILE_VALUE_INVALID, NBDS_ERR_CFILE_VALUE_UNSUPPORTED
795 * or NBDS_ERR_CFILE_NO_EXPORTS. @see NBDS_ERRS.
797 * @param expect_generic if true, we expect a configuration file that
798 * contains a [generic] section. If false, we don't.
800 * @return a GArray of SERVER* pointers. If the config file is empty or does not
801 * exist, returns an empty GArray; if the config file contains an
802 * error, returns NULL, and e is set appropriately
804 GArray* parse_cfile(gchar* f, struct generic_conf *const genconf, bool expect_generic, GError** e) {
805 const char* DEFAULT_ERROR = "Could not parse %s in group %s: %s";
806 const char* MISSING_REQUIRED_ERROR = "Could not find required value %s in group %s: %s";
807 gchar* cfdir = NULL;
808 SERVER s;
809 gchar *virtstyle=NULL;
810 PARAM lp[] = {
811 { "exportname", TRUE, PARAM_STRING, &(s.exportname), 0 },
812 { "authfile", FALSE, PARAM_STRING, &(s.authname), 0 },
813 { "filesize", FALSE, PARAM_OFFT, &(s.expected_size), 0 },
814 { "virtstyle", FALSE, PARAM_STRING, &(virtstyle), 0 },
815 { "prerun", FALSE, PARAM_STRING, &(s.prerun), 0 },
816 { "postrun", FALSE, PARAM_STRING, &(s.postrun), 0 },
817 { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog), 0 },
818 { "cowdir", FALSE, PARAM_STRING, &(s.cowdir), 0 },
819 { "readonly", FALSE, PARAM_BOOL, &(s.flags), F_READONLY },
820 { "multifile", FALSE, PARAM_BOOL, &(s.flags), F_MULTIFILE },
821 { "treefiles", FALSE, PARAM_BOOL, &(s.flags), F_TREEFILES },
822 { "copyonwrite", FALSE, PARAM_BOOL, &(s.flags), F_COPYONWRITE },
823 { "waitfile", FALSE, PARAM_BOOL, &(s.flags), F_WAIT },
824 { "sparse_cow", FALSE, PARAM_BOOL, &(s.flags), F_SPARSE },
825 { "sdp", FALSE, PARAM_BOOL, &(s.flags), F_SDP },
826 { "sync", FALSE, PARAM_BOOL, &(s.flags), F_SYNC },
827 { "flush", FALSE, PARAM_BOOL, &(s.flags), F_FLUSH },
828 { "fua", FALSE, PARAM_BOOL, &(s.flags), F_FUA },
829 { "rotational", FALSE, PARAM_BOOL, &(s.flags), F_ROTATIONAL },
830 { "temporary", FALSE, PARAM_BOOL, &(s.flags), F_TEMPORARY },
831 { "trim", FALSE, PARAM_BOOL, &(s.flags), F_TRIM },
832 { "datalog", FALSE, PARAM_BOOL, &(s.flags), F_DATALOG },
833 { "listenaddr", FALSE, PARAM_STRING, &(s.listenaddr), 0 },
834 { "maxconnections", FALSE, PARAM_INT, &(s.max_connections), 0 },
835 { "force_tls", FALSE, PARAM_BOOL, &(s.flags), F_FORCEDTLS },
836 { "splice", FALSE, PARAM_BOOL, &(s.flags), F_SPLICE},
838 const int lp_size=sizeof(lp)/sizeof(PARAM);
839 struct generic_conf genconftmp;
840 PARAM gp[] = {
841 { "user", FALSE, PARAM_STRING, &(genconftmp.user), 0 },
842 { "group", FALSE, PARAM_STRING, &(genconftmp.group), 0 },
843 { "oldstyle", FALSE, PARAM_BOOL, &(genconftmp.flags), F_OLDSTYLE }, // only left here so we can issue an appropriate error message when the option is used
844 { "listenaddr", FALSE, PARAM_STRING, &(genconftmp.modernaddr), 0 },
845 { "port", FALSE, PARAM_STRING, &(genconftmp.modernport), 0 },
846 { "includedir", FALSE, PARAM_STRING, &cfdir, 0 },
847 { "allowlist", FALSE, PARAM_BOOL, &(genconftmp.flags), F_LIST },
848 { "unixsock", FALSE, PARAM_STRING, &(genconftmp.unixsock), 0 },
849 { "duallisten", FALSE, PARAM_BOOL, &(genconftmp.flags), F_DUAL_LISTEN }, // Used to listen on both TCP and unix socket
850 { "max_threads", FALSE, PARAM_INT, &(genconftmp.threads), 0 },
851 { "force_tls", FALSE, PARAM_BOOL, &(genconftmp.flags), F_FORCEDTLS },
852 { "certfile", FALSE, PARAM_STRING, &(genconftmp.certfile), 0 },
853 { "keyfile", FALSE, PARAM_STRING, &(genconftmp.keyfile), 0 },
854 { "cacertfile", FALSE, PARAM_STRING, &(genconftmp.cacertfile), 0 },
855 { "tlsprio", FALSE, PARAM_STRING, &(genconftmp.tlsprio), 0 },
857 PARAM* p=gp;
858 int p_size=sizeof(gp)/sizeof(PARAM);
859 _cleanup_(g_key_file_freep) GKeyFile *cfile = NULL;
860 g_autoptr(GError) err = NULL;
861 const char *err_msg=NULL;
862 GArray *retval=NULL;
863 gchar **groups;
864 gboolean bval;
865 gint ival;
866 gint64 i64val;
867 gchar* sval;
868 _cleanup_g_free_ gchar* startgroup = NULL;
869 gint i;
870 gint j;
872 memset(&genconftmp, 0, sizeof(struct generic_conf));
874 genconftmp.tlsprio = "NORMAL:+VERS-TLS-ALL:-VERS-TLS1.0:+VERS-TLS1.1:%SERVER_PRECEDENCE";
876 if (genconf) {
877 /* Use the passed configuration values as defaults. The
878 * parsing algorithm below updates all parameter targets
879 * found from configuration files. */
880 memcpy(&genconftmp, genconf, sizeof(struct generic_conf));
883 cfile = g_key_file_new();
884 retval = g_array_new(FALSE, TRUE, sizeof(SERVER*));
885 if(expect_generic) {
886 g_array_set_clear_func(retval, (GDestroyNotify)serve_clear_element);
888 if(!g_key_file_load_from_file(cfile, f, G_KEY_FILE_KEEP_COMMENTS |
889 G_KEY_FILE_KEEP_TRANSLATIONS, &err)) {
890 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_NOTFOUND, "Could not open config file %s: %s",
891 f, err->message);
892 return retval;
894 startgroup = g_key_file_get_start_group(cfile);
895 if((!startgroup || strcmp(startgroup, "generic")) && expect_generic) {
896 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_MISSING_GENERIC, "Config file does not contain the [generic] group!");
897 return NULL;
899 groups = g_key_file_get_groups(cfile, NULL);
900 for(i=0;groups[i];i++) {
901 memset(&s, '\0', sizeof(SERVER));
903 /* After the [generic] group or when we're parsing an include
904 * directory, start parsing exports */
905 if(i==1 || !expect_generic) {
906 p=lp;
907 p_size=lp_size;
909 for(j=0;j<p_size;j++) {
910 assert(p[j].target != NULL);
911 assert(p[j].ptype==PARAM_INT||p[j].ptype==PARAM_STRING||p[j].ptype==PARAM_BOOL||p[j].ptype==PARAM_INT64);
912 switch(p[j].ptype) {
913 case PARAM_INT:
914 ival = g_key_file_get_integer(cfile,
915 groups[i],
916 p[j].paramname,
917 &err);
918 if(!err) {
919 *((gint*)p[j].target) = ival;
921 break;
922 case PARAM_INT64:
923 i64val = g_key_file_get_int64(cfile,
924 groups[i],
925 p[j].paramname,
926 &err);
927 if(!err) {
928 *((gint64*)p[j].target) = i64val;
930 break;
931 case PARAM_STRING:
932 sval = g_key_file_get_string(cfile,
933 groups[i],
934 p[j].paramname,
935 &err);
936 if(!err) {
937 *((gchar**)p[j].target) = sval;
939 break;
940 case PARAM_BOOL:
941 bval = g_key_file_get_boolean(cfile,
942 groups[i],
943 p[j].paramname, &err);
944 if(!err) {
945 if(bval) {
946 *((gint*)p[j].target) |= p[j].flagval;
947 } else {
948 *((gint*)p[j].target) &= ~(p[j].flagval);
951 break;
953 if(err) {
954 if(err->code == G_KEY_FILE_ERROR_KEY_NOT_FOUND) {
955 if(!p[j].required) {
956 /* Ignore not-found error for optional values */
957 g_clear_error(&err);
958 continue;
959 } else {
960 err_msg = MISSING_REQUIRED_ERROR;
962 } else {
963 err_msg = DEFAULT_ERROR;
965 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_INVALID, err_msg, p[j].paramname, groups[i], err->message);
966 g_array_free(retval, TRUE);
967 return NULL;
970 if(virtstyle) {
971 if(!strncmp(virtstyle, "none", 4)) {
972 s.virtstyle=VIRT_NONE;
973 } else if(!strncmp(virtstyle, "ipliteral", 9)) {
974 s.virtstyle=VIRT_IPLIT;
975 } else if(!strncmp(virtstyle, "iphash", 6)) {
976 s.virtstyle=VIRT_IPHASH;
977 } else if(!strncmp(virtstyle, "cidrhash", 8)) {
978 s.virtstyle=VIRT_CIDR;
979 if(strlen(virtstyle)<10) {
980 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s: missing length", virtstyle, groups[i]);
981 g_array_free(retval, TRUE);
982 return NULL;
984 s.cidrlen=strtol(virtstyle+8, NULL, 0);
985 } else {
986 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s", virtstyle, groups[i]);
987 g_array_free(retval, TRUE);
988 return NULL;
990 } else {
991 s.virtstyle=VIRT_IPLIT;
993 if(genconftmp.flags & F_OLDSTYLE) {
994 g_message("Since 3.10, the oldstyle protocol is no longer supported. Please migrate to the newstyle protocol.");
995 g_message("Exiting.");
996 return NULL;
998 #ifndef HAVE_SPLICE
999 if (s.flags & F_SPLICE) {
1000 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without splice support, yet group %s uses it", groups[i]);
1001 g_array_free(retval, TRUE);
1002 return NULL;
1004 #endif
1005 /* We can't mix copyonwrite and splice. */
1006 if ((s.flags & F_COPYONWRITE) && (s.flags & F_SPLICE)) {
1007 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_INVALID_SPLICE,
1008 "Cannot mix copyonwrite with splice for an export in group %s",
1009 groups[i]);
1010 g_array_free(retval, TRUE);
1011 return NULL;
1013 if ((s.flags & F_COPYONWRITE) && (s.flags & F_WAIT)) {
1014 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_INVALID_WAIT,
1015 "Cannot mix copyonwrite with waitfile for an export in group %s",
1016 groups[i]);
1017 g_array_free(retval, TRUE);
1018 return NULL;
1020 /* We can't mix datalog and splice. */
1021 if ((s.flags & F_DATALOG) && (s.flags & F_SPLICE)) {
1022 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_INVALID_SPLICE,
1023 "Cannot mix datalog with splice for an export in group %s",
1024 groups[i]);
1025 g_array_free(retval, TRUE);
1026 return NULL;
1028 /* Don't need to free this, it's not our string */
1029 virtstyle=NULL;
1030 /* Don't append values for the [generic] group */
1031 if(i>0 || !expect_generic) {
1032 s.servename = groups[i];
1034 SERVER *srv = serve_inc_ref(g_memdup2(&s, sizeof(SERVER)));
1035 g_array_append_val(retval, srv);
1037 #ifndef WITH_SDP
1038 if(s.flags & F_SDP) {
1039 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without support for SDP, yet group %s uses it", groups[i]);
1040 g_array_free(retval, TRUE);
1041 return NULL;
1043 #endif
1045 if(cfdir) {
1046 GArray* extra = do_cfile_dir(cfdir, &genconftmp, e);
1047 if(extra) {
1048 retval = g_array_append_vals(retval, extra->data, extra->len);
1049 i+=extra->len;
1050 g_array_free(extra, TRUE);
1051 } else {
1052 if(*e) {
1053 g_array_free(retval, TRUE);
1054 return NULL;
1058 if(i==1 && expect_generic) {
1059 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_NO_EXPORTS, "The config file does not specify any exports");
1062 if (genconf) {
1063 /* Return the updated generic configuration through the
1064 * pointer parameter. */
1065 memcpy(genconf, &genconftmp, sizeof(struct generic_conf));
1068 return retval;
1072 * Handle SIGCHLD by setting atomically a flag which will be evaluated in the
1073 * main loop of the root server process. This allows us to separate the signal
1074 * catching from th actual task triggered by SIGCHLD and hence processing in the
1075 * interrupt context is kept as minimial as possible.
1077 * @param s the signal we're handling (must be SIGCHLD, or something
1078 * is severely wrong)
1080 static void sigchld_handler(const int s G_GNUC_UNUSED) {
1081 is_sigchld_caught = 1;
1085 * Kill a child. Called from sigterm_handler::g_hash_table_foreach.
1087 * @param key the key
1088 * @param value the value corresponding to the above key
1089 * @param user_data a pointer which we always set to 1, so that we know what
1090 * will happen next.
1092 void killchild(gpointer key, gpointer value, gpointer user_data) {
1093 pid_t *pid=value;
1095 kill(*pid, SIGTERM);
1099 * Handle SIGTERM by setting atomically a flag which will be evaluated in the
1100 * main loop of the root server process. This allows us to separate the signal
1101 * catching from th actual task triggered by SIGTERM and hence processing in the
1102 * interrupt context is kept as minimial as possible.
1104 * @param s the signal we're handling (must be SIGTERM, or something
1105 * is severely wrong).
1107 static void sigterm_handler(const int s G_GNUC_UNUSED) {
1108 is_sigterm_caught = 1;
1112 * Handle SIGHUP by setting atomically a flag which will be evaluated in
1113 * the main loop of the root server process. This allows us to separate
1114 * the signal catching from th actual task triggered by SIGHUP and hence
1115 * processing in the interrupt context is kept as minimial as possible.
1117 * @param s the signal we're handling (must be SIGHUP, or something
1118 * is severely wrong).
1120 static void sighup_handler(const int s G_GNUC_UNUSED) {
1121 is_sighup_caught = 1;
1124 static void sigusr1_handler(const int s G_GNUC_UNUSED) {
1125 msg(LOG_INFO, "Got SIGUSR1");
1126 sem_post(&file_wait_sem);
1130 * Get the file handle and offset, given an export offset.
1132 * @param client The client we're serving for
1133 * @param a The offset to get corresponding file/offset for
1134 * @param fhandle [out] File descriptor
1135 * @param foffset [out] Offset into fhandle
1136 * @param maxbytes [out] Tells how many bytes can be read/written
1137 * from fhandle starting at foffset (0 if there is no limit)
1138 * @return 0 on success, -1 on failure
1140 int get_filepos(CLIENT *client, off_t a, int* fhandle, off_t* foffset, size_t* maxbytes ) {
1141 GArray * const export = client->export;
1143 /* Negative offset not allowed */
1144 if(a < 0)
1145 return -1;
1147 /* Open separate file for treefiles */
1148 if (client->server->flags & F_TREEFILES) {
1149 *foffset = a % TREEPAGESIZE;
1150 *maxbytes = (( 1 + (a/TREEPAGESIZE) ) * TREEPAGESIZE) - a; // start position of next block
1151 *fhandle = open_treefile(client->exportname, ((client->server->flags & F_READONLY) ? O_RDONLY : O_RDWR), client->exportsize,a, &client->lock);
1152 return 0;
1155 /* Binary search for last file with starting offset <= a */
1156 FILE_INFO fi;
1157 int start = 0;
1158 int end = export->len - 1;
1159 while( start <= end ) {
1160 int mid = (start + end) / 2;
1161 fi = g_array_index(export, FILE_INFO, mid);
1162 if( fi.startoff < a ) {
1163 start = mid + 1;
1164 } else if( fi.startoff > a ) {
1165 end = mid - 1;
1166 } else {
1167 start = end = mid;
1168 break;
1172 /* end should never go negative, since first startoff is 0 and a >= 0 */
1173 assert(end >= 0);
1175 fi = g_array_index(export, FILE_INFO, end);
1176 *fhandle = fi.fhandle;
1177 *foffset = a - fi.startoff;
1178 *maxbytes = 0;
1179 if( end+1 < export->len ) {
1180 FILE_INFO fi_next = g_array_index(export, FILE_INFO, end+1);
1181 *maxbytes = fi_next.startoff - a;
1184 return 0;
1188 * Write an amount of bytes at a given offset to the right file. This
1189 * abstracts the write-side of the multiple file option.
1191 * @param a The offset where the write should start
1192 * @param buf The buffer to write from
1193 * @param len The length of buf
1194 * @param client The client we're serving for
1195 * @param fua Flag to indicate 'Force Unit Access'
1196 * @return The number of bytes actually written, or -1 in case of an error
1198 ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1199 int fhandle;
1200 off_t foffset;
1201 size_t maxbytes;
1202 ssize_t retval;
1204 if(get_filepos(client, a, &fhandle, &foffset, &maxbytes))
1205 return -1;
1206 if(maxbytes && len > maxbytes)
1207 len = maxbytes;
1209 DEBUG("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, (long long unsigned)foffset, (unsigned int)len, fua);
1211 retval = pwrite(fhandle, buf, len, foffset);
1212 if(client->server->flags & F_SYNC) {
1213 fsync(fhandle);
1214 } else if (fua) {
1216 /* This is where we would do the following
1217 * #ifdef USE_SYNC_FILE_RANGE
1218 * However, we don't, for the reasons set out below
1219 * by Christoph Hellwig <hch@infradead.org>
1221 * [BEGINS]
1222 * fdatasync is equivalent to fsync except that it does not flush
1223 * non-essential metadata (basically just timestamps in practice), but it
1224 * does flush metadata requried to find the data again, e.g. allocation
1225 * information and extent maps. sync_file_range does nothing but flush
1226 * out pagecache content - it means you basically won't get your data
1227 * back in case of a crash if you either:
1229 * a) have a volatile write cache in your disk (e.g. any normal SATA disk)
1230 * b) are using a sparse file on a filesystem
1231 * c) are using a fallocate-preallocated file on a filesystem
1232 * d) use any file on a COW filesystem like btrfs
1234 * e.g. it only does anything useful for you if you do not have a volatile
1235 * write cache, and either use a raw block device node, or just overwrite
1236 * an already fully allocated (and not preallocated) file on a non-COW
1237 * filesystem.
1238 * [ENDS]
1240 * What we should do is open a second FD with O_DSYNC set, then write to
1241 * that when appropriate. However, with a Linux client, every REQ_FUA
1242 * immediately follows a REQ_FLUSH, so fdatasync does not cause performance
1243 * problems.
1246 #if 0
1247 sync_file_range(fhandle, foffset, len,
1248 SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
1249 SYNC_FILE_RANGE_WAIT_AFTER);
1250 #else
1251 fdatasync(fhandle);
1252 #endif
1254 /* close file pointer in case of treefiles */
1255 if (client->server->flags & F_TREEFILES) {
1256 close(fhandle);
1258 return retval;
1262 * Call rawexpwrite repeatedly until all data has been written.
1264 * @param a The offset where the write should start
1265 * @param buf The buffer to write from
1266 * @param len The length of buf
1267 * @param client The client we're serving for
1268 * @param fua Flag to indicate 'Force Unit Access'
1269 * @return 0 on success, nonzero on failure
1271 int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1272 ssize_t ret=0;
1274 while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
1275 a += ret;
1276 buf += ret;
1277 len -= ret;
1279 return (ret < 0 || len != 0);
1282 static void setup_reply(struct nbd_reply* rep, struct nbd_request* req) {
1283 rep->magic = htonl(NBD_REPLY_MAGIC);
1284 rep->error = 0;
1285 rep->cookie = req->cookie;
1288 static void log_reply(CLIENT *client, struct nbd_reply *prply) {
1289 if (client->transactionlogfd != -1) {
1290 lock_logsem(client);
1291 writeit(client->transactionlogfd, prply, sizeof(*prply));
1292 unlock_logsem(client);
1296 static void log_structured_reply(CLIENT *client, struct nbd_structured_reply *prply) {
1297 if (client->transactionlogfd != -1) {
1298 lock_logsem(client);
1299 writeit(client->transactionlogfd, prply, sizeof(*prply));
1300 unlock_logsem(client);
1304 void send_structured_chunk(CLIENT *client, struct nbd_request *req, uint16_t flags, uint16_t type, uint32_t length, int bufcount, void *buf[], size_t buflen[]) {
1305 struct nbd_structured_reply rep;
1306 rep.magic = htonl(NBD_STRUCTURED_REPLY_MAGIC);
1307 rep.flags = htons(flags);
1308 rep.type = htons(type);
1309 rep.cookie = req->cookie;
1310 rep.paylen = htonl(length);
1311 pthread_mutex_lock(&(client->lock));
1312 socket_write(client, &rep, sizeof rep);
1313 for(int i=0; i<bufcount; i++) {
1314 socket_write(client, buf[i], buflen[i]);
1316 pthread_mutex_unlock(&(client->lock));
1317 log_structured_reply(client, &rep);
1320 void send_structured_chunk_v(CLIENT *client, struct nbd_request *req, uint16_t flags, uint16_t type, uint32_t length, int bufcount, ...) {
1321 struct nbd_structured_reply rep;
1322 va_list ap;
1323 rep.magic = htonl(NBD_STRUCTURED_REPLY_MAGIC);
1324 rep.flags = htons(flags);
1325 rep.type = htons(type);
1326 rep.cookie = req->cookie;
1327 rep.paylen = htonl(length);
1328 va_start(ap, bufcount);
1329 pthread_mutex_lock(&(client->lock));
1330 socket_write(client, &rep, sizeof rep);
1331 for(int i=0; i<bufcount; i++) {
1332 void *buf = va_arg(ap, void*);
1333 size_t size = va_arg(ap, size_t);
1334 socket_write(client, buf, size);
1336 pthread_mutex_unlock(&(client->lock));
1337 log_structured_reply(client, &rep);
1338 va_end(ap);
1342 * Find the location to write the data for the next chunk to.
1343 * Assumes checks on memory sizes etc have already been done.
1345 * @param ctx the context we're working with
1346 * @param offset the offset into the request
1347 * @param len the length of this chunk.
1349 char * find_read_buf(READ_CTX *ctx) {
1350 if(!(ctx->is_structured) || ctx->df) {
1351 return ctx->buf + ctx->current_offset;
1353 ctx->buf = malloc(ctx->current_len);
1354 if(!(ctx->buf)) {
1355 err("Could not allocate memory for request");
1357 return ctx->buf;
1360 void confirm_read(CLIENT *client, READ_CTX *ctx, size_t len_read) {
1361 if(ctx->is_structured && !(ctx->df)) {
1362 uint64_t offset = htonll(ctx->req->from + (uint64_t)(ctx->current_offset));
1363 send_structured_chunk_v(client, ctx->req, 0, NBD_REPLY_TYPE_OFFSET_DATA, len_read + 8, 2, &offset, sizeof offset, ctx->buf, (size_t)len_read);
1364 free(ctx->buf);
1368 void complete_read(CLIENT *client, READ_CTX *ctx, uint32_t error, char *errmsg, uint16_t msglen, bool with_offset, uint64_t err_offset) {
1369 uint16_t type;
1370 uint64_t offset = 0;
1371 if(ctx->is_structured) {
1372 if(ctx->df) {
1373 uint32_t len = ctx->req->len;
1374 if(error != 0 && with_offset) {
1375 len = err_offset;
1377 if(error == 0 || with_offset) {
1378 offset = htonll(ctx->req->from);
1379 send_structured_chunk_v(client, ctx->req, 0, NBD_REPLY_TYPE_OFFSET_DATA, len + 8, 2, &offset, sizeof offset, ctx->buf, err_offset);
1381 free(ctx->buf);
1383 if(error != 0) {
1384 struct nbd_structured_error_payload pl;
1385 void *buf[3];
1386 size_t bufsize[3];
1387 int payloads = 1;
1388 size_t total_size;
1389 pl.error = error;
1390 pl.msglen = msglen;
1391 if(with_offset) {
1392 offset += err_offset;
1393 type = NBD_REPLY_TYPE_ERROR_OFFSET;
1394 } else {
1395 type = NBD_REPLY_TYPE_ERROR;
1397 buf[0] = &pl;
1398 bufsize[0] = sizeof pl;
1399 total_size = bufsize[0];
1400 if(msglen > 0) {
1401 buf[payloads] = errmsg;
1402 bufsize[payloads++] = msglen;
1403 total_size += msglen;
1405 if(with_offset) {
1406 buf[payloads] = &offset;
1407 bufsize[payloads++] = sizeof offset;
1408 total_size += sizeof offset;
1410 send_structured_chunk(client, ctx->req, NBD_REPLY_FLAG_DONE, type, total_size, payloads, buf, bufsize);
1411 return;
1413 send_structured_chunk_v(client, ctx->req, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_NONE, 0, 0);
1414 } else {
1415 struct nbd_reply rep;
1416 setup_reply(&rep, ctx->req);
1417 if(error) {
1418 rep.error = error;
1420 log_reply(client, &rep);
1421 pthread_mutex_lock(&(client->lock));
1422 socket_write(client, &rep, sizeof rep);
1423 if(!error) {
1424 socket_write(client, ctx->buf, ctx->buflen);
1426 pthread_mutex_unlock(&(client->lock));
1427 free(ctx->buf);
1432 * Read an amount of bytes at a given offset from the right file. This
1433 * abstracts the read-side of the multiple files option.
1435 * @param a The offset where the read should start
1436 * @param buf A buffer to read into
1437 * @param len The size of buf
1438 * @param client The client we're serving for
1439 * @return The number of bytes actually read, or -1 in case of an
1440 * error.
1442 ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
1443 int fhandle;
1444 off_t foffset;
1445 size_t maxbytes;
1446 ssize_t retval;
1448 if(get_filepos(client, a, &fhandle, &foffset, &maxbytes))
1449 return -1;
1450 if(maxbytes && len > maxbytes)
1451 len = maxbytes;
1453 DEBUG("(READ from fd %d offset %llu len %u), ", fhandle, (long long unsigned int)foffset, (unsigned int)len);
1455 retval = pread(fhandle, buf, len, foffset);
1456 if (client->server->flags & F_TREEFILES) {
1457 close(fhandle);
1459 return retval;
1463 * Call rawexpread repeatedly until all data has been read.
1464 * @return 0 on success, nonzero on failure
1466 int rawexpread_fully(READ_CTX *ctx, CLIENT *client) {
1467 ssize_t ret=0;
1469 char *buf;
1471 while(ctx->current_len > 0) {
1472 buf = find_read_buf(ctx);
1473 if((ret = rawexpread((off_t)ctx->req->from + (off_t)ctx->current_offset, buf, ctx->current_len, client)) <= 0) {
1474 break;
1476 confirm_read(client, ctx, ret);
1477 ctx->current_offset += ret;
1478 ctx->current_len -= ret;
1480 return (ret < 0 || ctx->current_len != 0);
1483 #ifdef HAVE_SPLICE
1484 int rawexpsplice(int pipe, off_t a, size_t len, CLIENT *client, int dir,
1485 int fua)
1487 int fhandle;
1488 off_t foffset;
1489 size_t maxbytes;
1490 ssize_t retval;
1492 if (get_filepos(client, a, &fhandle, &foffset, &maxbytes))
1493 return -1;
1494 if (maxbytes && len > maxbytes)
1495 len = maxbytes;
1497 DEBUG("(SPLICE %s fd %d offset %llu len %u), ",
1498 (dir == SPLICE_IN) ? "from" : "to", fhandle,
1499 (unsigned long long)a, (unsigned)len);
1502 * SPLICE_F_MOVE doesn't actually work at the moment, but in the future
1503 * it might, so go ahead and use it.
1505 if (dir == SPLICE_IN) {
1506 retval = splice(fhandle, &foffset, pipe, NULL, len,
1507 SPLICE_F_MOVE);
1508 } else {
1509 retval = splice(pipe, NULL, fhandle, &foffset, len,
1510 SPLICE_F_MOVE);
1511 if (client->server->flags & F_SYNC)
1512 fsync(fhandle);
1513 else if (fua)
1514 fdatasync(fhandle);
1516 if (client->server->flags & F_TREEFILES)
1517 close(fhandle);
1518 return retval;
1522 * Splice an amount of bytes from the given offset from/into the right file
1523 * from/into the given pipe.
1524 * @param pipe The pipe we are using for this splice.
1525 * @param a The offset of the file we are operating on.
1526 * @param len The length of the splice.
1527 * @param client The client we're splicing for.
1528 * @param dir The direction we are doing the splice in.
1529 * @param fua Set if this is a write and we need to fua.
1530 * @return 0 on success, nonzero on failure.
1532 int expsplice(int pipe, off_t a, size_t len, CLIENT *client, int dir, int fua)
1534 ssize_t ret = 0;
1536 while (len > 0 &&
1537 (ret = rawexpsplice(pipe, a, len, client, dir, fua)) > 0) {
1538 a += ret;
1539 len -= ret;
1541 return (ret < 0 || len != 0);
1543 #endif /* HAVE_SPLICE */
1546 * Read an amount of bytes at a given offset from the right file. This
1547 * abstracts the read-side of the copyonwrite stuff, and calls
1548 * rawexpread() with the right parameters to do the actual work.
1549 * @param a The offset where the read should start
1550 * @param buf A buffer to read into
1551 * @param len The size of buf
1552 * @param client The client we're going to read for
1553 * @return 0 on success, nonzero on failure
1555 int expread(READ_CTX *ctx, CLIENT *client) {
1556 off_t rdlen, offset;
1557 off_t mapcnt, mapl, maph, pagestart;
1558 off_t a = (off_t)ctx->current_offset + (off_t)ctx->req->from;
1559 size_t len = (size_t) ctx->req->len;
1560 int rv = 0;
1562 DEBUG("Asked to read %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1564 if (!(client->server->flags & F_COPYONWRITE) && !((client->server->flags & F_WAIT) && (client->export == NULL)))
1565 return(rawexpread_fully(ctx, client));
1567 mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE;
1569 for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1570 pagestart=mapcnt*DIFFPAGESIZE;
1571 offset=a-pagestart;
1572 rdlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1573 len : (size_t)DIFFPAGESIZE-offset;
1574 if (!(client->server->flags & F_COPYONWRITE))
1575 pthread_rwlock_rdlock(&client->export_lock);
1576 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1577 DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1578 (unsigned long)(client->difmap[mapcnt]));
1579 char *buf = find_read_buf(ctx);
1580 if (pread(client->difffile, buf, rdlen, client->difmap[mapcnt]*DIFFPAGESIZE+offset) != rdlen) {
1581 goto fail;
1583 confirm_read(client, ctx, rdlen);
1584 } else { /* the block is not there */
1585 if ((client->server->flags & F_WAIT) && (client->export == NULL)){
1586 DEBUG("Page %llu is not here, and waiting for file\n",
1587 (unsigned long long)mapcnt);
1588 goto fail;
1589 } else {
1590 DEBUG("Page %llu is not here, we read the original one\n",
1591 (unsigned long long)mapcnt);
1592 ctx->current_len = rdlen;
1593 if(rawexpread_fully(ctx, client)) goto fail;
1596 if (!(client->server->flags & F_COPYONWRITE))
1597 pthread_rwlock_unlock(&client->export_lock);
1598 len-=rdlen; a+=rdlen;
1600 rv = 0;
1601 goto end;
1602 fail:
1603 if (!(client->server->flags & F_COPYONWRITE))
1604 pthread_rwlock_unlock(&client->export_lock);
1605 rv = -1;
1606 end:
1607 return rv;
1611 * Write an amount of bytes at a given offset to the right file. This
1612 * abstracts the write-side of the copyonwrite option, and calls
1613 * rawexpwrite() with the right parameters to do the actual work.
1615 * @param a The offset where the write should start
1616 * @param buf The buffer to write from
1617 * @param len The length of buf
1618 * @param client The client we're going to write for.
1619 * @param fua Flag to indicate 'Force Unit Access'
1620 * @return 0 on success, nonzero on failure
1622 int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1623 char pagebuf[DIFFPAGESIZE];
1624 off_t mapcnt,mapl,maph;
1625 off_t wrlen,rdlen;
1626 off_t pagestart;
1627 off_t offset;
1629 DEBUG("Asked to write %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1632 if (!(client->server->flags & F_COPYONWRITE) && !((client->server->flags & F_WAIT) && (client->export == NULL)))
1633 return(rawexpwrite_fully(a, buf, len, client, fua));
1635 mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
1637 for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1638 pagestart=mapcnt*DIFFPAGESIZE ;
1639 offset=a-pagestart ;
1640 wrlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1641 len : (size_t)DIFFPAGESIZE-offset;
1643 if (!(client->server->flags & F_COPYONWRITE))
1644 pthread_rwlock_rdlock(&client->export_lock);
1645 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1646 DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1647 (unsigned long)(client->difmap[mapcnt])) ;
1648 if (pwrite(client->difffile, buf, wrlen, client->difmap[mapcnt]*DIFFPAGESIZE+offset) != wrlen) goto fail;
1649 } else { /* the block is not there */
1650 client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++;
1651 DEBUG("Page %llu is not here, we put it at %lu\n",
1652 (unsigned long long)mapcnt,
1653 (unsigned long)(client->difmap[mapcnt]));
1654 if ((offset != 0) || (wrlen != DIFFPAGESIZE)){
1655 if ((client->server->flags & F_WAIT) && (client->export == NULL)){
1656 DEBUG("error: we can write only whole page while waiting for file\n");
1657 goto fail;
1659 rdlen=DIFFPAGESIZE;
1660 int ret;
1661 char *ptr = pagebuf;
1662 while(rdlen > 0 && (ret = rawexpread(pagestart, ptr, rdlen, client)) > 0) {
1663 pagestart += ret;
1664 ptr += ret;
1665 rdlen -= ret;
1667 if(ret < 0 ) goto fail;
1669 memcpy(pagebuf+offset,buf,wrlen) ;
1670 if (write(client->difffile, pagebuf, DIFFPAGESIZE) != DIFFPAGESIZE)
1671 goto fail;
1673 if (!(client->server->flags & F_COPYONWRITE))
1674 pthread_rwlock_unlock(&client->export_lock);
1675 len-=wrlen ; a+=wrlen ; buf+=wrlen ;
1677 if (client->server->flags & F_SYNC) {
1678 fsync(client->difffile);
1679 } else if (fua) {
1680 /* open question: would it be cheaper to do multiple sync_file_ranges?
1681 as we iterate through the above?
1683 fdatasync(client->difffile);
1685 return 0;
1686 fail:
1687 if (!(client->server->flags & F_COPYONWRITE))
1688 pthread_rwlock_unlock(&client->export_lock);
1689 return -1;
1694 * Write an amount of zeroes at a given offset to the right file.
1695 * This routine could be optimised by not calling expwrite. However,
1696 * this is by far the simplest way to do it.
1698 * @param req the request
1699 * @param client The client we're going to write for.
1700 * @return 0 on success, nonzero on failure
1702 int expwrite_zeroes(struct nbd_request* req, CLIENT* client, int fua) {
1703 off_t a = req->from;
1704 size_t len = req->len;
1705 size_t maxsize = 64LL*1024LL*1024LL;
1706 /* use calloc() as sadly MAP_ANON is apparently not POSIX standard */
1707 char *buf = calloc (1, maxsize);
1708 int ret;
1709 while (len > 0) {
1710 size_t l = len;
1711 if (l > maxsize)
1712 l = maxsize;
1713 ret = expwrite(a, buf, l, client, fua);
1714 if (ret) {
1715 free(buf);
1716 return ret;
1718 len -= l;
1720 free(buf);
1721 return 0;
1725 * Flush data to a client
1727 * @param client The client we're going to write for.
1728 * @return 0 on success, nonzero on failure
1730 int expflush(CLIENT *client) {
1731 gint i;
1733 if (client->server->flags & F_COPYONWRITE) {
1734 return fsync(client->difffile);
1737 if (client->server->flags & F_WAIT) {
1738 return fsync(client->difffile);
1741 if (client->server->flags & F_TREEFILES ) {
1742 // all we can do is force sync the entire filesystem containing the tree
1743 if (client->server->flags & F_READONLY)
1744 return 0;
1745 sync();
1746 return 0;
1749 for (i = 0; i < client->export->len; i++) {
1750 FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
1751 if (fsync(fi.fhandle) < 0)
1752 return -1;
1755 return 0;
/**
 * Try to deallocate (punch a hole into) a range of a file or device,
 * using whichever mechanisms were found at build time, in this order:
 * fallocate(FALLOC_FL_PUNCH_HOLE), ioctl(BLKDISCARD), and
 * FSCTL_SET_ZERO_DATA. Calls interrupted by a signal (EINTR) are retried.
 * Failure is only reported through DEBUG().
 *
 * @param fd the file descriptor to punch a hole in
 * @param off the start of the range
 * @param len the length of the range
 **/
void punch_hole(int fd, off_t off, off_t len) {
	DEBUG("Request to punch a hole in fd=%d, starting from %llu, length %llu\n", fd, (unsigned long long)off, (unsigned long long)len);
	errno = 0;
// fallocate -- files, Linux
#if HAVE_FALLOC_PH
	for(;;) {
		if(fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len) == 0) {
			return;
		}
		if(errno != EINTR) {
			break;
		}
	}
#endif
// ioctl(BLKDISCARD) -- block devices, Linux
#if HAVE_BLKDISCARD
	uint64_t range[2] = {off, len};
	for(;;) {
		if(ioctl(fd, BLKDISCARD, range) == 0) {
			return;
		}
		if(errno != EINTR) {
			break;
		}
	}
#endif
// Windows
#if HAVE_FSCTL_SET_ZERO_DATA
	FILE_ZERO_DATA_INFORMATION zerodata;
	DWORD bytesret;
	HANDLE w32handle = (HANDLE)_get_osfhandle(fd);
	zerodata.FileOffset.QuadPart = off;
	zerodata.BeyondFinalZero.QuadPart = off + len;
	DeviceIoControl(w32handle, FSCTL_SET_ZERO_DATA, &zerodata, sizeof(zerodata), NULL, 0, &bytesret, NULL);
	return;
#endif
	if(!errno) {
		/* nothing was even attempted */
		DEBUG("punching holes not supported on this platform\n");
	} else {
		DEBUG("punching holes failed: %s", strerror(errno));
	}
}
1793 static void send_reply(CLIENT* client, uint32_t opt, uint32_t reply_type, ssize_t datasize, const void* data) {
1794 struct {
1795 uint64_t magic;
1796 uint32_t opt;
1797 uint32_t reply_type;
1798 uint32_t datasize;
1799 } __attribute__ ((packed)) header = {
1800 htonll(0x3e889045565a9LL),
1801 htonl(opt),
1802 htonl(reply_type),
1803 htonl(datasize),
1805 if(datasize < 0) {
1806 datasize = strlen((char*)data);
1807 header.datasize = htonl(datasize);
1809 socket_write(client, &header, sizeof(header));
1810 if(data != NULL) {
1811 socket_write(client, data, datasize);
1816 * Find the name of the file we have to serve. This will use g_strdup_printf
1817 * to put the IP address of the client inside a filename containing
1818 * "%s" (in the form as specified by the "virtstyle" option). That name
1819 * is then written to client->exportname.
1821 * @param net A socket connected to an nbd client
1822 * @param client information about the client. The IP address in human-readable
1823 * format will be written to a new char* buffer, the address of which will be
1824 * stored in client->clientname.
1825 * @return: 0 - OK, -1 - failed.
1827 int set_peername(int net, CLIENT *client) {
1828 struct sockaddr_storage netaddr;
1829 struct sockaddr* addr = (struct sockaddr*)&netaddr;
1830 socklen_t addrinlen = sizeof( struct sockaddr_storage );
1831 struct addrinfo hints;
1832 struct addrinfo *ai = NULL;
1833 char peername[NI_MAXHOST];
1834 char netname[NI_MAXHOST];
1835 char *tmp = NULL;
1836 int i;
1837 int e;
1839 if (getsockname(net, addr, &addrinlen) < 0) {
1840 msg(LOG_INFO, "getsockname failed: %m");
1841 return -1;
1844 if(netaddr.ss_family == AF_UNIX) {
1845 client->clientaddr.ss_family = AF_UNIX;
1846 strcpy(peername, "unix");
1847 } else {
1848 if (getpeername(net, (struct sockaddr *) &(client->clientaddr), &addrinlen) < 0) {
1849 msg(LOG_INFO, "getpeername failed: %m");
1850 return -1;
1852 if((e = getnameinfo((struct sockaddr *)&(client->clientaddr), addrinlen,
1853 peername, sizeof (peername), NULL, 0, NI_NUMERICHOST))) {
1854 msg(LOG_INFO, "getnameinfo failed: %s", gai_strerror(e));
1855 return -1;
1858 memset(&hints, '\0', sizeof (hints));
1859 hints.ai_flags = AI_ADDRCONFIG;
1860 e = getaddrinfo(peername, NULL, &hints, &ai);
1862 if(e != 0) {
1863 msg(LOG_INFO, "getaddrinfo failed: %s", gai_strerror(e));
1864 freeaddrinfo(ai);
1865 return -1;
1869 if(strncmp(peername, "::ffff:", 7) == 0) {
1870 memmove(peername, peername+7, strlen(peername));
1873 switch(client->server->virtstyle) {
1874 case VIRT_NONE:
1875 msg(LOG_DEBUG, "virtualization is off");
1876 client->exportname=g_strdup(client->server->exportname);
1877 break;
1878 case VIRT_IPHASH:
1879 msg(LOG_DEBUG, "virtstyle iphash");
1880 for(i=0;i<strlen(peername);i++) {
1881 if(peername[i]=='.') {
1882 peername[i]='/';
1885 break;
1886 case VIRT_IPLIT:
1887 msg(LOG_DEBUG, "virtstyle ipliteral");
1888 client->exportname=g_strdup_printf(client->server->exportname, peername);
1889 break;
1890 case VIRT_CIDR:
1891 msg(LOG_DEBUG, "virtstyle cidr %d", client->server->cidrlen);
1892 memcpy(&netaddr, &(client->clientaddr), addrinlen);
1893 int addrbits;
1894 if(client->clientaddr.ss_family == AF_UNIX) {
1895 tmp = g_strdup(peername);
1896 } else {
1897 assert((ai->ai_family == AF_INET) || (ai->ai_family == AF_INET6));
1898 if(ai->ai_family == AF_INET) {
1899 addrbits = 32;
1900 } else if(ai->ai_family == AF_INET6) {
1901 addrbits = 128;
1902 } else {
1903 g_assert_not_reached();
1905 uint8_t* addrptr = (uint8_t*)(((struct sockaddr*)&netaddr)->sa_data);
1906 for(int i = 0; i < addrbits; i+=8) {
1907 int masklen = client->server->cidrlen - i;
1908 masklen = masklen > 0 ? masklen : 0;
1909 uint8_t mask = getmaskbyte(masklen);
1910 *addrptr &= mask;
1911 addrptr++;
1913 getnameinfo((struct sockaddr *) &netaddr, addrinlen,
1914 netname, sizeof (netname), NULL, 0, NI_NUMERICHOST);
1915 tmp=g_strdup_printf("%s/%s", netname, peername);
1918 if(tmp != NULL) {
1919 client->exportname=g_strdup_printf(client->server->exportname, tmp);
1920 g_free(tmp);
1923 break;
1926 if(ai) {
1927 freeaddrinfo(ai);
1929 msg(LOG_INFO, "connect from %s, assigned file is %s",
1930 peername, client->exportname);
1931 client->clientname=g_strdup(peername);
1932 return 0;
1935 int commit_diff(CLIENT* client, bool lock, int fhandle){
1936 int dirtycount = 0;
1937 int pagecount = client->exportsize/DIFFPAGESIZE;
1938 off_t offset;
1939 char* buf = malloc(sizeof(char)*DIFFPAGESIZE);
1941 for (int i=0; i<pagecount; i++){
1942 offset = DIFFPAGESIZE*i;
1943 if (lock)
1944 pthread_rwlock_wrlock(&client->export_lock);
1945 if (client->difmap[i] != (u32)-1){
1946 dirtycount += 1;
1947 DEBUG("flushing dirty page %d, offset %ld\n", i, offset);
1948 if (pread(client->difffile, buf, DIFFPAGESIZE, client->difmap[i]*DIFFPAGESIZE) != DIFFPAGESIZE) {
1949 msg(LOG_WARNING, "could not read while committing diff: %m");
1950 if(lock) {
1951 pthread_rwlock_unlock(&client->export_lock);
1953 break;
1955 if (pwrite(fhandle, buf, DIFFPAGESIZE, offset) != DIFFPAGESIZE) {
1956 msg(LOG_WARNING, "could not write while committing diff: %m");
1957 if (lock) {
1958 pthread_rwlock_unlock(&client->export_lock);
1960 break;
1962 client->difmap[i] = (u32)-1;
1964 if (lock)
1965 pthread_rwlock_unlock(&client->export_lock);
1968 free(buf);
1969 return dirtycount;
1972 void* wait_file(void *void_ptr) {
1973 CLIENT* client = (CLIENT *)void_ptr;
1974 FILE_INFO fi;
1975 GArray* export;
1976 mode_t mode = O_RDWR;
1977 int dirtycount;
1979 fi.fhandle = -1;
1980 fi.startoff = 0;
1982 while (fi.fhandle < 1){
1983 sem_wait(&file_wait_sem);
1984 msg(LOG_INFO, "checking for file %s", client->server->exportname);
1985 fi.fhandle = open(client->server->exportname, mode);
1988 msg(LOG_INFO, "File %s appeared, fd %d", client->server->exportname, fi.fhandle);
1990 // first time there may be lot of data so we lock only per page
1991 do {
1992 dirtycount = commit_diff(client, true, fi.fhandle);
1993 } while (dirtycount > 0);
1995 //last time we lock export for the whole time until we switch write destination
1996 pthread_rwlock_wrlock(&client->export_lock);
1997 do {
1998 dirtycount = commit_diff(client, false, fi.fhandle);
1999 } while (dirtycount > 0);
2001 export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
2002 g_array_append_val(export, fi);
2004 client->export = export;
2005 pthread_rwlock_unlock(&client->export_lock);
2006 msg(LOG_INFO, "Waiting for file ended, switching to exported file %s", client->server->exportname);
2008 return NULL;
2012 * Set up client export array, which is an array of FILE_INFO.
2013 * Also, split a single exportfile into multiple ones, if that was asked.
2014 * @param client information on the client which we want to setup export for
2016 bool setupexport(CLIENT* client) {
2017 int i = 0;
2018 off_t laststartoff = 0, lastsize = 0;
2019 int multifile = (client->server->flags & F_MULTIFILE);
2020 int treefile = (client->server->flags & F_TREEFILES);
2021 int temporary = (client->server->flags & F_TEMPORARY) && !multifile;
2022 int cancreate = (client->server->expected_size) && !multifile;
2024 if (treefile || (client->server->flags & F_WAIT)) {
2025 client->export = NULL; // this could be thousands of files so we open handles on demand although its slower
2026 client->exportsize = client->server->expected_size; // available space is not checked, as it could change during runtime anyway
2028 if(client->server->flags & F_WAIT){
2029 pthread_t wait_file_thread;
2030 if (pthread_create(&wait_file_thread, NULL, wait_file, client)){
2031 DEBUG("failed to create wait_file thread");
2032 return false;
2036 } else {
2037 client->export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
2039 /* If multi-file, open as many files as we can.
2040 * If not, open exactly one file.
2041 * Calculate file sizes as we go to get total size. */
2042 for(i=0; ; i++) {
2043 FILE_INFO fi;
2044 _cleanup_g_free_ gchar *tmpname = NULL;
2045 _cleanup_g_free_ gchar* error_string = NULL;
2047 if (i)
2048 cancreate = 0;
2049 /* if expected_size is specified, and this is the first file, we can create the file */
2050 mode_t mode = (client->server->flags & F_READONLY) ?
2051 O_RDONLY : (O_RDWR | (cancreate?O_CREAT:0));
2053 if (temporary) {
2054 tmpname=g_strdup_printf("%s.%d-XXXXXX", client->exportname, i);
2055 DEBUG( "Opening %s\n", tmpname );
2056 fi.fhandle = mkstemp(tmpname);
2057 } else {
2058 if(multifile) {
2059 tmpname=g_strdup_printf("%s.%d", client->exportname, i);
2060 } else {
2061 tmpname=g_strdup(client->exportname);
2063 DEBUG( "Opening %s\n", tmpname );
2064 fi.fhandle = open(tmpname, mode, 0600);
2065 if(fi.fhandle == -1 && mode == O_RDWR) {
2066 /* Try again because maybe media was read-only */
2067 fi.fhandle = open(tmpname, O_RDONLY);
2068 if(fi.fhandle != -1) {
2069 /* Opening the base file in copyonwrite mode is
2070 * okay */
2071 if(!(client->server->flags & F_COPYONWRITE)) {
2072 client->server->flags |= F_AUTOREADONLY;
2073 client->server->flags |= F_READONLY;
2078 if(fi.fhandle == -1) {
2079 if(multifile && i>0)
2080 break;
2081 error_string=g_strdup_printf(
2082 "Could not open exported file %s: %%m",
2083 tmpname);
2084 err_nonfatal(error_string);
2085 return false;
2088 if (temporary) {
2089 unlink(tmpname); /* File will stick around whilst FD open */
2092 fi.startoff = laststartoff + lastsize;
2093 g_array_append_val(client->export, fi);
2095 /* Starting offset and size of this file will be used to
2096 * calculate starting offset of next file */
2097 laststartoff = fi.startoff;
2098 lastsize = size_autodetect(fi.fhandle);
2100 /* If we created the file, it will be length zero */
2101 if (!lastsize && cancreate) {
2102 assert(!multifile);
2103 if(ftruncate (fi.fhandle, client->server->expected_size)<0) {
2104 err_nonfatal("Could not expand file: %m");
2105 return false;
2107 lastsize = client->server->expected_size;
2108 break; /* don't look for any more files */
2111 if(!multifile || temporary)
2112 break;
2115 /* Set export size to total calculated size */
2116 client->exportsize = laststartoff + lastsize;
2118 /* Export size may be overridden */
2119 if(client->server->expected_size) {
2120 /* desired size must be <= total calculated size */
2121 if(client->server->expected_size > client->exportsize) {
2122 err_nonfatal("Size of exported file is too big\n");
2123 return false;
2126 client->exportsize = client->server->expected_size;
2130 msg(LOG_INFO, "Size of exported file/device is %llu", (unsigned long long)client->exportsize);
2131 if(multifile) {
2132 msg(LOG_INFO, "Total number of files: %d", i);
2134 if(treefile) {
2135 msg(LOG_INFO, "Total number of (potential) files: %" PRId64, (client->exportsize+TREEPAGESIZE-1)/TREEPAGESIZE);
2137 return true;
2140 bool copyonwrite_prepare(CLIENT* client) {
2141 off_t i;
2142 _cleanup_g_free_ gchar* dir = NULL;
2143 _cleanup_g_free_ gchar* export_base = NULL;
2144 if (client->server->cowdir != NULL) {
2145 dir = g_strdup(client->server->cowdir);
2146 } else {
2147 dir = g_strdup(dirname(client->exportname));
2149 export_base = g_strdup(basename(client->exportname));
2150 client->difffilename = g_strdup_printf("%s/%s-%s-%d.diff",dir,export_base,client->clientname,
2151 (int)getpid());
2152 msg(LOG_INFO, "About to create map and diff file %s", client->difffilename) ;
2153 client->difffile=open(client->difffilename,O_RDWR | O_CREAT | O_TRUNC,0600) ;
2154 if (client->difffile<0) {
2155 err("Could not create diff file (%m)");
2156 return false;
2158 if ((client->difmap=calloc(client->exportsize/DIFFPAGESIZE,sizeof(u32)))==NULL) {
2159 err("Could not allocate memory");
2160 return false;
2162 for (i=0;i<client->exportsize/DIFFPAGESIZE;i++) client->difmap[i]=(u32)-1;
2164 return true;
2167 void send_export_info(CLIENT* client, SERVER* server, bool maybe_zeroes) {
2168 uint64_t size_host = htonll((u64)(client->exportsize));
2169 uint16_t flags = NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_WRITE_ZEROES;
2171 socket_write(client, &size_host, 8);
2172 if (server->flags & F_READONLY)
2173 flags |= NBD_FLAG_READ_ONLY;
2174 if (server->flags & F_FLUSH)
2175 flags |= NBD_FLAG_SEND_FLUSH;
2176 if (server->flags & F_FUA)
2177 flags |= NBD_FLAG_SEND_FUA;
2178 if (server->flags & F_ROTATIONAL)
2179 flags |= NBD_FLAG_ROTATIONAL;
2180 if (server->flags & F_TRIM)
2181 flags |= NBD_FLAG_SEND_TRIM;
2182 if (!(server->flags & F_COPYONWRITE))
2183 flags |= NBD_FLAG_CAN_MULTI_CONN;
2184 if (client->clientflags & F_STRUCTURED)
2185 flags |= NBD_FLAG_SEND_DF;
2186 flags = htons(flags);
2187 socket_write(client, &flags, sizeof(flags));
2188 if (!(glob_flags & F_NO_ZEROES) && maybe_zeroes) {
2189 char zeros[128];
2190 memset(zeros, '\0', sizeof(zeros));
2191 socket_write(client, zeros, 124);
2196 * Setup the transaction log
2198 * The function does all things required for the transaction log:
2199 * - Create a new log file.
2200 * - allocate the posix semaphore for synchronization.
2201 * - Report if a log file already exists.
2202 * - If needed add a header to the log.
2204 * If something goes wrong, logging is disabled.
2206 * @param client the CLIENT structure with .server and .net members set
2207 * up correctly
2209 static void setup_transactionlog(CLIENT *client) {
2210 struct stat fdinfo;
2211 int ret;
2213 /* 1) create the file */
2214 if((client->transactionlogfd =
2215 open(client->server->transactionlog,
2216 O_WRONLY | O_CREAT,
2217 S_IRUSR | S_IWUSR)) ==
2218 -1) {
2219 msg(LOG_INFO, "Could not open transactionlog %s, moving on without it",
2220 client->server->transactionlog);
2221 return;
2224 /* 2) If needed, write flags */
2225 if (client->server->flags & F_DATALOG) {
2226 struct nbd_request req;
2227 int ret;
2229 req.magic = htonl(NBD_TRACELOG_MAGIC);
2230 req.type = htonl(NBD_TRACELOG_SET_DATALOG);
2231 req.cookie = 0;
2232 req.from = htonll(NBD_TRACELOG_FROM_MAGIC);
2233 req.len = htonl(TRUE);
2235 ret = writeit(client->transactionlogfd, &req, sizeof(struct nbd_request));
2236 if (ret < 0) {
2237 msg(LOG_INFO, "Could not write to transactionlog %s, moving on without it",
2238 client->server->transactionlog);
2239 close(client->transactionlogfd);
2240 client->transactionlogfd = -1;
2241 return;
2245 /* 3) Allocate the semaphore used for locking */
2246 ret = fstat(client->transactionlogfd, &fdinfo);
2247 if (ret == -1) {
2248 msg(LOG_INFO, "Could not stat transactionlog %s, moving on without it",
2249 client->server->transactionlog);
2250 close(client->transactionlogfd);
2251 client->transactionlogfd = -1;
2252 return;
2254 snprintf(client->semname, sizeof(client->semname), "/nbd-server-%llx-%llx",
2255 (unsigned long long)fdinfo.st_dev,
2256 (unsigned long long)fdinfo.st_ino);
2257 client->logsem = sem_open(client->semname, O_CREAT, 0600, 1);
2258 if (client->logsem == SEM_FAILED) {
2259 msg(LOG_INFO, "Could not allocate semaphore for transactionlog %s, moving on without it",
2260 client->server->transactionlog);
2261 close(client->transactionlogfd);
2262 client->transactionlogfd = -1;
2267 * Commit to exporting the chosen export
2269 * When a client sends NBD_OPT_EXPORT_NAME or NBD_OPT_GO, we need to do
2270 * a number of things (verify whether the client is allowed access, try
2271 * to open files, etc etc) before we're ready to actually serve the
2272 * export.
2274 * This function does all those things.
2276 * @param client the CLIENT structure with .server and .net members set
2277 * up correctly
2278 * @return true if the client is allowed access to the export, false
2279 * otherwise
2281 static bool commit_client(CLIENT* client, SERVER* server) {
2282 char acl;
2283 uint32_t len;
2285 client->server = serve_inc_ref(server);
2286 client->exportsize = OFFT_MAX;
2287 client->transactionlogfd = -1;
2288 if(pthread_mutex_init(&(client->lock), NULL)) {
2289 msg(LOG_ERR, "Unable to initialize mutex");
2290 return false;
2292 if (pthread_rwlock_init(&client->export_lock, NULL)){
2293 msg(LOG_ERR, "Unable to initialize write lock");
2294 return false;
2296 /* Check whether we exceeded the maximum number of allowed
2297 * clients already */
2298 if(dontfork) {
2299 acl = 'Y';
2300 } else {
2301 len = strlen(client->server->servename);
2302 writeit(commsocket, &len, sizeof len);
2303 writeit(commsocket, client->server->servename, len);
2304 readit(commsocket, &acl, 1);
2305 close(commsocket);
2307 switch(acl) {
2308 case 'N':
2309 msg(LOG_ERR, "Connection not allowed (too many clients)");
2310 return false;
2311 case 'X':
2312 msg(LOG_ERR, "Connection not allowed (unknown by parent?!?)");
2313 return false;
2316 /* Check whether the client is listed in the authfile */
2317 if (set_peername(client->net, client)) {
2318 msg(LOG_ERR, "Failed to set peername");
2319 return false;
2322 if (!authorized_client(client)) {
2323 msg(LOG_INFO, "Client '%s' is not authorized to access",
2324 client->clientname);
2325 return false;
2328 /* Set up the transactionlog, if we need one */
2329 if (client->server->transactionlog && (client->transactionlogfd == -1))
2330 setup_transactionlog(client);
2332 /* Run any pre scripts that we may need */
2333 if (do_run(client->server->prerun, client->exportname)) {
2334 msg(LOG_INFO, "Client '%s' not allowed access by prerun script",
2335 client->clientname);
2336 return false;
2338 client->socket_closed = socket_closed_transmission;
2339 if(!setupexport(client)) {
2340 return false;
2343 if (client->server->flags & F_COPYONWRITE) {
2344 if(!copyonwrite_prepare(client)) {
2345 return false;
2349 if (client->server->flags & F_WAIT) {
2350 if(!copyonwrite_prepare(client)) {
2351 return false;
2355 setmysockopt(client->net);
2357 return true;
2360 static CLIENT* handle_export_name(CLIENT* client, uint32_t opt, GArray* servers, uint32_t cflags) {
2361 uint32_t namelen;
2362 char* name;
2363 int i;
2365 socket_read(client, &namelen, sizeof(namelen));
2366 namelen = ntohl(namelen);
2367 if(namelen > 4096) {
2368 return NULL;
2370 if(namelen > 0) {
2371 name = malloc(namelen+1);
2372 name[namelen]=0;
2373 socket_read(client, name, namelen);
2374 } else {
2375 name = strdup("");
2377 for(i=0; i<servers->len; i++) {
2378 SERVER* serve = (g_array_index(servers, SERVER*, i));
2379 // hide exports that are TLS-only if we haven't negotiated TLS
2380 // yet
2381 if ((serve->flags & F_FORCEDTLS) && !client->tls_session) {
2382 continue;
2384 if(!strcmp(serve->servename, name)) {
2385 client->clientfeats = cflags;
2386 free(name);
2387 if(!commit_client(client, serve)) {
2388 return NULL;
2390 send_export_info(client, serve, true);
2391 return client;
2394 free(name);
2395 err("Negotiation failed/8a: Requested export not found, or is TLS-only and client did not negotiate TLS");
2398 static void handle_list(CLIENT* client, uint32_t opt, GArray* servers, uint32_t cflags) {
2399 uint32_t len;
2400 int i;
2401 char buf[1024];
2402 char *ptr = buf + sizeof(len);
2404 socket_read(client, &len, sizeof(len));
2405 len = ntohl(len);
2406 if(len) {
2407 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "NBD_OPT_LIST with nonzero data length is not a valid request");
2409 if(!(glob_flags & F_LIST)) {
2410 send_reply(client, opt, NBD_REP_ERR_POLICY, -1, "Listing of exports denied by server configuration");
2411 err_nonfatal("Client tried disallowed list option");
2412 return;
2414 for(i=0; i<servers->len; i++) {
2415 SERVER* serve = (g_array_index(servers, SERVER*, i));
2416 // Hide TLS-only exports if we haven't negotiated TLS yet
2417 if(!client->tls_session && (serve->flags & F_FORCEDTLS)) {
2418 continue;
2420 len = htonl(strlen(serve->servename));
2421 memcpy(buf, &len, sizeof(len));
2422 strncpy(ptr, serve->servename, sizeof(buf) - sizeof(len));
2423 send_reply(client, opt, NBD_REP_SERVER, strlen(serve->servename)+sizeof(len), buf);
2425 send_reply(client, opt, NBD_REP_ACK, 0, NULL);
2428 #if HAVE_GNUTLS
2429 static int verify_cert(gnutls_session_t session) {
2430 int ret;
2431 unsigned int status, cert_list_size;
2432 const gnutls_datum_t *cert_list;
2433 gnutls_x509_crt_t cert;
2434 time_t now = time(NULL);
2436 ret = gnutls_certificate_verify_peers2(session, &status);
2437 if(ret < 0 || status != 0 || gnutls_certificate_type_get(session) !=
2438 GNUTLS_CRT_X509) {
2439 goto err;
2442 if(gnutls_x509_crt_init(&cert) < 0) {
2443 goto err;
2446 cert_list = gnutls_certificate_get_peers(session, &cert_list_size);
2447 if(cert_list == NULL) {
2448 goto err;
2450 if(gnutls_x509_crt_import(cert, &cert_list[0], GNUTLS_X509_FMT_DER) < 0) {
2451 goto err;
2453 if(gnutls_x509_crt_get_activation_time(cert) > now) {
2454 goto err;
2456 if(gnutls_x509_crt_get_expiration_time(cert) < now) {
2457 goto err;
2459 // TODO: check CRLs and/or OCSP etc. Patches welcome.
2460 msg(LOG_INFO, "client certificate verification successful");
2461 return 0;
2462 err:
2463 msg(LOG_ERR, "E: client certificate verification failed");
2464 return GNUTLS_E_CERTIFICATE_ERROR;
2467 CLIENT* handle_starttls(CLIENT* client, int opt, GArray* servers, uint32_t cflags, struct generic_conf *genconf) {
2468 #define check_rv(c) if((c)<0) { retval = NULL; goto exit; }
2469 gnutls_certificate_credentials_t x509_cred;
2470 CLIENT* retval = client;
2471 gnutls_priority_t priority_cache;
2472 gnutls_session_t *session = g_new0(gnutls_session_t, 1);
2473 int ret;
2474 int len;
2476 socket_read(client, &len, sizeof(len));
2477 if(G_UNLIKELY(len != 0)) {
2478 char buf[1024*1024];
2479 consume(client, len, buf, sizeof(buf));
2480 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "Sending a STARTTLS command with data is invalid");
2481 return NULL;
2484 send_reply(client, opt, NBD_REP_ACK, 0, NULL);
2486 check_rv(gnutls_certificate_allocate_credentials(&x509_cred));
2487 gnutls_certificate_set_verify_function(x509_cred, verify_cert);
2488 check_rv(gnutls_certificate_set_x509_trust_file(x509_cred, genconf->cacertfile, GNUTLS_X509_FMT_PEM));
2489 check_rv(gnutls_certificate_set_x509_key_file(x509_cred, genconf->certfile, genconf->keyfile, GNUTLS_X509_FMT_PEM));
2490 check_rv(gnutls_priority_init(&priority_cache, genconf->tlsprio, NULL));
2491 check_rv(gnutls_init(session, GNUTLS_SERVER));
2492 check_rv(gnutls_priority_set(*session, priority_cache));
2493 check_rv(gnutls_credentials_set(*session, GNUTLS_CRD_CERTIFICATE, x509_cred));
2495 gnutls_certificate_server_set_request(*session, GNUTLS_CERT_REQUEST);
2496 #if GNUTLS_VERSION_NUMBER >= 0x030109
2497 gnutls_transport_set_int(*session, client->net);
2498 #else
2499 gnutls_transport_set_ptr(*session, (gnutls_transport_ptr_t) (intptr_t) client->net);
2500 #endif
2501 do {
2502 ret = gnutls_handshake(*session);
2503 } while(ret < 0 && gnutls_error_is_fatal(ret) == 0);
2505 if (ret < 0) {
2506 err_nonfatal(gnutls_strerror(ret));
2507 gnutls_bye(*session, GNUTLS_SHUT_RDWR);
2508 gnutls_deinit(*session);
2509 g_free(session);
2510 return NULL;
2512 client->tls_session = session;
2513 client->socket_read = socket_read_tls;
2514 client->socket_write = socket_write_tls;
2515 #undef check_rv
2516 exit:
2517 if(retval == NULL && session != NULL) {
2518 g_free(session);
2520 /* export names cannot be chosen before NBD_OPT_STARTTLS and be retained */
2521 if(retval != NULL && retval->server != NULL) {
2522 retval->server = NULL;
2524 return retval;
2526 #endif
2529 * Handle an NBD_OPT_STRUCTURED_REPLY message
2531 static void handle_structured_reply(CLIENT *client, uint32_t opt, GArray *servers, uint32_t cflags) {
2532 uint32_t len;
2533 int i;
2535 socket_read(client, &len, sizeof(len));
2536 len = ntohl(len);
2537 if(len) {
2538 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "NBD_OPT_STRUCTURED_REPLY with nonzero data length is not a valid request");
2539 char buf[1024];
2540 consume(client, len, buf, sizeof buf);
2541 return;
2543 if(client->clientflags & F_STRUCTURED) {
2544 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "NBD_OPT_STRUCTURED_REPLY has already been called");
2545 return;
2547 client->clientflags |= F_STRUCTURED;
2548 send_reply(client, opt, NBD_REP_ACK, 0, NULL);
2552 * Handle an NBD_OPT_INFO or NBD_OPT_GO request.
2554 static bool handle_info(CLIENT* client, uint32_t opt, GArray* servers, uint32_t cflags) {
2555 uint32_t namelen, len;
2556 char *name;
2557 int i;
2558 SERVER *server = NULL;
2559 uint16_t n_requests;
2560 uint16_t request;
2561 char buf[1024];
2562 bool sent_export = false;
2563 uint32_t reptype = NBD_REP_ERR_UNKNOWN;
2564 char *msg = "Export unknown";
2566 socket_read(client, &len, sizeof(len));
2567 len = htonl(len);
2568 socket_read(client, &namelen, sizeof(namelen));
2569 namelen = htonl(namelen);
2570 if(namelen > (len - 6)) {
2571 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "An OPT_INFO request cannot be smaller than the length of the name + 6");
2572 consume(client, len - sizeof(namelen), buf, sizeof(buf));
2574 if(namelen > 4096) {
2575 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "The name for this OPT_INFO request is too long");
2576 consume(client, namelen, buf, sizeof(buf));
2578 if(namelen > 0) {
2579 name = malloc(namelen + 1);
2580 if (!name) {
2581 send_reply(client, opt, reptype, -1, "nbd server out of memory");
2582 return false;
2584 name[namelen] = 0;
2585 socket_read(client, name, namelen);
2586 } else {
2587 name = strdup("");
2589 for(i=0; i<servers->len; i++) {
2590 SERVER *serve = (g_array_index(servers, SERVER*, i));
2591 if (!strcmp(serve->servename, name)) {
2592 if ((serve->flags & F_FORCEDTLS) && !client->tls_session) {
2593 reptype = NBD_REP_ERR_TLS_REQD;
2594 msg = "TLS is required for that export";
2595 continue;
2597 server = serve;
2600 free(name);
2601 socket_read(client, &n_requests, sizeof(n_requests));
2602 n_requests = ntohs(n_requests);
2603 if(!server) {
2604 consume(client, n_requests * sizeof(request), buf,
2605 sizeof(buf));
2606 send_reply(client, opt, reptype, -1, msg);
2607 return false;
2609 if (opt == NBD_OPT_GO) {
2610 client->clientfeats = cflags;
2611 if(!commit_client(client, server)) {
2612 consume(client, n_requests * sizeof(request), buf,
2613 sizeof(buf));
2614 send_reply(client, opt, NBD_REP_ERR_POLICY, -1, "Access denied by server configuration");
2615 return false;
2618 for(i=0; i<n_requests; i++) {
2619 socket_read(client, &request, sizeof(request));
2620 switch(ntohs(request)) {
2621 case NBD_INFO_EXPORT:
2622 send_reply(client, opt, NBD_REP_INFO, 12, NULL);
2623 socket_write(client, &request, 2);
2624 send_export_info(client, server, false);
2625 sent_export = true;
2626 break;
2627 default:
2628 // ignore all other options for now.
2629 break;
2632 if(!sent_export) {
2633 request = htons(NBD_INFO_EXPORT);
2634 send_reply(client, opt, NBD_REP_INFO, 12, NULL);
2635 socket_write(client, &request, 2);
2636 send_export_info(client, server, false);
2638 send_reply(client, opt, NBD_REP_ACK, 0, NULL);
2640 return true;
2644 * Do the initial negotiation.
2646 * @param net The socket we're doing the negotiation over.
2647 * @param servers The array of known servers.
2648 * @param genconf the global options (needed for accessing TLS config data)
2650 CLIENT* negotiate(int net, GArray* servers, struct generic_conf *genconf) {
2651 uint16_t smallflags = NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES;
2652 uint64_t magic;
2653 uint32_t cflags = 0;
2654 uint32_t opt;
2655 CLIENT* client = g_new0(CLIENT, 1);
2656 client->net = net;
2657 client->socket_read = socket_read_notls;
2658 client->socket_write = socket_write_notls;
2659 client->socket_closed = socket_closed_negotiate;
2660 client->transactionlogfd = -1;
2661 client->logsem = SEM_FAILED;
2663 assert(servers != NULL);
2664 socket_write(client, INIT_PASSWD, 8);
2665 magic = htonll(opts_magic);
2666 socket_write(client, &magic, sizeof(magic));
2668 smallflags = htons(smallflags);
2669 socket_write(client, &smallflags, sizeof(uint16_t));
2670 socket_read(client, &cflags, sizeof(cflags));
2671 cflags = htonl(cflags);
2672 if (cflags & NBD_FLAG_C_NO_ZEROES) {
2673 glob_flags |= F_NO_ZEROES;
2675 do {
2676 socket_read(client, &magic, sizeof(magic));
2677 magic = ntohll(magic);
2678 if(magic != opts_magic) {
2679 err_nonfatal("Negotiation failed/5a: magic mismatch");
2680 goto handler_err;
2682 socket_read(client, &opt, sizeof(opt));
2683 opt = ntohl(opt);
2684 if(client->tls_session == NULL
2685 && glob_flags & F_FORCEDTLS
2686 && opt != NBD_OPT_STARTTLS) {
2687 if(opt == NBD_OPT_EXPORT_NAME) {
2688 // can't send an error message for EXPORT_NAME,
2689 // so must do hard close
2690 goto handler_err;
2692 if(opt == NBD_OPT_ABORT) {
2693 // handled below
2694 break;
2696 consume_len(client);
2697 send_reply(client, opt, NBD_REP_ERR_TLS_REQD, -1, "TLS is required on this server");
2698 continue;
2700 switch(opt) {
2701 case NBD_OPT_EXPORT_NAME:
2702 // NBD_OPT_EXPORT_NAME must be the last
2703 // selected option, so return from here
2704 // if that is chosen.
2705 if(handle_export_name(client, opt, servers, cflags) != NULL) {
2706 return client;
2707 } else {
2708 goto handler_err;
2710 break;
2711 case NBD_OPT_LIST:
2712 handle_list(client, opt, servers, cflags);
2713 break;
2714 case NBD_OPT_ABORT:
2715 // handled below
2716 break;
2717 case NBD_OPT_STARTTLS:
2718 #if !HAVE_GNUTLS
2719 consume_len(client);
2720 send_reply(client, opt, NBD_REP_ERR_PLATFORM, -1, "This nbd-server was compiled without TLS support");
2721 #else
2722 if(client->tls_session != NULL) {
2723 consume_len(client);
2724 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "Invalid STARTTLS request: TLS has already been negotiated!");
2725 continue;
2727 if(genconf->keyfile == NULL) {
2728 consume_len(client);
2729 send_reply(client, opt, NBD_REP_ERR_POLICY, -1, "TLS not allowed on this server");
2730 continue;
2732 if(handle_starttls(client, opt, servers, cflags, genconf) == NULL) {
2733 // can't recover from failed TLS negotiation.
2734 goto handler_err;
2736 // once TLS has been negotiated, any state must be cleared
2737 client->clientflags = 0;
2738 #endif
2739 break;
2740 case NBD_OPT_GO:
2741 case NBD_OPT_INFO:
2742 if(handle_info(client, opt, servers, cflags) && opt == NBD_OPT_GO) {
2743 return client;
2745 break;
2746 case NBD_OPT_STRUCTURED_REPLY:
2747 handle_structured_reply(client, opt, servers, cflags);
2748 break;
2749 default:
2750 consume_len(client);
2751 send_reply(client, opt, NBD_REP_ERR_UNSUP, -1, "The given option is unknown to this server implementation");
2752 break;
2754 } while((opt != NBD_OPT_EXPORT_NAME) && (opt != NBD_OPT_ABORT));
2755 if(opt == NBD_OPT_ABORT) {
2756 err_nonfatal("Session terminated by client");
2757 goto handler_err;
2759 err_nonfatal("Weird things happened: reached end of negotiation without success");
2760 handler_err:
2761 g_free(client);
2762 return NULL;
/**
 * Translate a local errno value into the error number put on the wire.
 *
 * EPERM, EIO, ENOMEM and EINVAL map to 1, 5, 12 and 22; EFBIG, ENOSPC and
 * (where defined) EDQUOT all map to 28; every other value maps to 22.
 *
 * @param errcode an errno value
 * @return the mapped error number, already converted with htonl()
 **/
static int nbd_errno(int errcode) {
	uint32_t wire_code;

	switch (errcode) {
	case EPERM:
		wire_code = 1;
		break;
	case EIO:
		wire_code = 5;
		break;
	case ENOMEM:
		wire_code = 12;
		break;
	case EFBIG:
	case ENOSPC:
#ifdef EDQUOT
	case EDQUOT:
#endif
		wire_code = 28; // ENOSPC
		break;
	case EINVAL:
	default:
		wire_code = 22; // EINVAL
		break;
	}
	return htonl(wire_code);
}
2786 static void package_dispose(struct work_package* package) {
2787 if (package->pipefd[0] > 0)
2788 close(package->pipefd[0]);
2789 if (package->pipefd[1] > 0)
2790 close(package->pipefd[1]);
2791 g_free(package->data);
2792 g_free(package->req);
2793 g_free(package);
2796 static int mkpipe(int pipefd[2], size_t len)
2798 if (len > MAX_PIPE_SIZE)
2799 return -1;
2800 if (pipe(pipefd))
2801 return -1;
2803 #ifdef HAVE_SPLICE
2804 if (fcntl(pipefd[1], F_SETPIPE_SZ, MAX_PIPE_SIZE) < MAX_PIPE_SIZE) {
2805 close(pipefd[0]);
2806 close(pipefd[1]);
2807 pipefd[0] = -1;
2808 pipefd[1] = -1;
2809 return -1;
2811 #endif
2813 return 0;
2816 struct work_package* package_create(CLIENT* client, struct nbd_request* req) {
2817 struct work_package* rv = calloc(sizeof (struct work_package), 1);
2819 rv->req = req;
2820 rv->client = client;
2821 rv->data = NULL;
2822 rv->pipefd[0] = -1;
2823 rv->pipefd[1] = -1;
2825 if((req->type & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) {
2826 if (client->server->flags & F_SPLICE) {
2827 if (mkpipe(rv->pipefd, req->len))
2828 rv->data = malloc(req->len);
2829 } else {
2830 rv->data = malloc(req->len);
2834 return rv;
2837 #ifdef HAVE_SPLICE
2838 static int handle_splice_read(CLIENT *client, struct nbd_request *req)
2840 struct nbd_reply rep;
2841 int pipefd[2];
2843 // splice doesn't work with TLS
2844 if (client->tls_session != NULL)
2845 return -1;
2847 if (mkpipe(pipefd, req->len))
2848 return -1;
2850 if (expsplice(pipefd[1], req->from, req->len, client, SPLICE_IN, 0)) {
2851 close(pipefd[1]);
2852 close(pipefd[0]);
2853 return -1;
2856 DEBUG("handling read request (splice)\n");
2857 setup_reply(&rep, req);
2858 log_reply(client, &rep);
2859 pthread_mutex_lock(&(client->lock));
2860 writeit(client->net, &rep, sizeof(rep));
2861 spliceit(pipefd[0], NULL, client->net, NULL, req->len);
2862 pthread_mutex_unlock(&(client->lock));
2863 close(pipefd[0]);
2864 close(pipefd[1]);
2865 return 0;
2867 #endif
2869 static void handle_normal_read(CLIENT *client, struct nbd_request *req)
2871 DEBUG("handling read request\n");
2872 char read_failed[] = "Read failed";
2873 _cleanup_g_free_ READ_CTX *ctx = g_new0(READ_CTX, 1);
2874 ctx->req = req;
2875 ctx->current_len = req->len;
2876 uint32_t error = 0;
2877 char *errmsg = NULL;
2878 uint16_t msglen = 0;
2879 if(client->clientflags & F_STRUCTURED) {
2880 ctx->is_structured = 1;
2881 } else {
2882 ctx->is_structured = 0;
2884 if((req->type & NBD_CMD_FLAG_DF) != 0) {
2885 ctx->df = 1;
2887 if(ctx->is_structured && ctx->df && req->len > (1 << 20)) {
2888 /* standard requires a minimum of 64KiB; we are more generous
2889 * by allowing up to 1MiB as our largest unfragmented answer */
2890 const char too_long[] = "Request too long for unfragmented reply";
2891 struct nbd_structured_error_payload pl;
2892 pl.error = NBD_EOVERFLOW;
2893 pl.msglen = sizeof too_long;
2894 send_structured_chunk_v(client, req, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_ERROR, 6 + pl.msglen, 2, &pl, sizeof pl, too_long, sizeof too_long);
2895 return;
2897 if(ctx->df || !(ctx->is_structured)) {
2898 ctx->buf = malloc(req->len);
2899 if(!(ctx->buf)) {
2900 err("Could not allocate memory for request");
2902 ctx->buflen = req->len;
2904 if(expread(ctx, client)) {
2905 DEBUG("Read failed: %m");
2906 error = nbd_errno(errno);
2907 errmsg = read_failed;
2908 msglen = sizeof read_failed;
2910 complete_read(client, ctx, error, errmsg, msglen, false, 0);
2913 static void handle_read(CLIENT* client, struct nbd_request* req)
2915 #ifdef HAVE_SPLICE
2917 * If we have splice set we want to try that first, and if that fails
2918 * for whatever reason we fall through to ye olde read.
2920 if (client->server->flags & F_SPLICE)
2921 if (!handle_splice_read(client, req))
2922 return;
2923 #endif
2924 handle_normal_read(client, req);
2927 static void handle_write(struct work_package *pkg)
2929 CLIENT *client = pkg->client;
2930 struct nbd_request *req = pkg->req;
2931 struct nbd_reply rep;
2932 int fua = !!(req->type & NBD_CMD_FLAG_FUA);
2934 DEBUG("handling write request\n");
2935 setup_reply(&rep, req);
2937 #ifdef HAVE_SPLICE
2938 if (!pkg->data) {
2939 if (expsplice(pkg->pipefd[0], req->from, req->len, client,
2940 SPLICE_OUT, fua)) {
2941 DEBUG("Splice failed: %m");
2942 rep.error = nbd_errno(errno);
2944 } else
2945 #endif
2947 if(expwrite(req->from, pkg->data, req->len, client, fua)) {
2948 DEBUG("Write failed: %m");
2949 rep.error = nbd_errno(errno);
2952 log_reply(client, &rep);
2953 pthread_mutex_lock(&(client->lock));
2954 socket_write(client, &rep, sizeof rep);
2955 pthread_mutex_unlock(&(client->lock));
2958 static void handle_flush(CLIENT* client, struct nbd_request* req) {
2959 struct nbd_reply rep;
2960 DEBUG("handling flush request\n");
2961 setup_reply(&rep, req);
2962 if(expflush(client)) {
2963 DEBUG("Flush failed: %m");
2964 rep.error = nbd_errno(errno);
2966 log_reply(client, &rep);
2967 pthread_mutex_lock(&(client->lock));
2968 socket_write(client, &rep, sizeof rep);
2969 pthread_mutex_unlock(&(client->lock));
2972 static void handle_trim(CLIENT* client, struct nbd_request* req) {
2973 struct nbd_reply rep;
2974 DEBUG("handling trim request\n");
2975 setup_reply(&rep, req);
2976 if(exptrim(req, client)) {
2977 DEBUG("Trim failed: %m");
2978 rep.error = nbd_errno(errno);
2980 log_reply(client, &rep);
2981 pthread_mutex_lock(&(client->lock));
2982 socket_write(client, &rep, sizeof rep);
2983 pthread_mutex_unlock(&(client->lock));
2986 static void handle_write_zeroes(CLIENT* client, struct nbd_request* req) {
2987 struct nbd_reply rep;
2988 DEBUG("handling write_zeroes request\n");
2989 int fua = !!(req->type & NBD_CMD_FLAG_FUA);
2990 setup_reply(&rep, req);
2991 if(expwrite_zeroes(req, client, fua)) {
2992 DEBUG("Write_zeroes failed: %m");
2993 rep.error = nbd_errno(errno);
2995 // For now, don't trim
2996 // TODO: handle this far more efficiently with reference to the
2997 // actual backing driver
2998 log_reply(client, &rep);
2999 pthread_mutex_lock(&(client->lock));
3000 socket_write(client, &rep, sizeof rep);
3001 pthread_mutex_unlock(&(client->lock));
3005 static bool bad_write(CLIENT* client, struct nbd_request* req) {
3006 if ((client->server->flags & F_READONLY) ||
3007 (client->server->flags & F_AUTOREADONLY)) {
3008 DEBUG("[WRITE to READONLY!]");
3009 return true;
3011 return false;
3014 static bool bad_range(CLIENT* client, struct nbd_request* req) {
3015 if(req->from > client->exportsize ||
3016 req->from + req->len > client->exportsize) {
3017 DEBUG("[out of bounds!]");
3018 return true;
3020 return false;
3023 static void handle_request(gpointer data, gpointer user_data) {
3024 struct work_package* package = (struct work_package*) data;
3025 uint32_t type = package->req->type & NBD_CMD_MASK_COMMAND;
3026 uint32_t flags = package->req->type & ~NBD_CMD_MASK_COMMAND;
3027 struct nbd_reply rep;
3028 int err = EINVAL;
3030 if(flags & ~(NBD_CMD_FLAG_FUA | NBD_CMD_FLAG_NO_HOLE)) {
3031 msg(LOG_ERR, "E: received invalid flag %d on command %d, ignoring", flags, type);
3032 goto error;
3035 switch(type) {
3036 case NBD_CMD_READ:
3037 if (bad_range(package->client, package->req)) {
3038 goto error;
3040 handle_read(package->client, package->req);
3041 break;
3042 case NBD_CMD_WRITE:
3043 if (bad_write(package->client, package->req)) {
3044 err = EPERM;
3045 goto error;
3047 if (bad_range(package->client, package->req)) {
3048 err = ENOSPC;
3049 goto error;
3051 handle_write(package);
3052 break;
3053 case NBD_CMD_FLUSH:
3054 handle_flush(package->client, package->req);
3055 break;
3056 case NBD_CMD_TRIM:
3057 if (bad_write(package->client, package->req)) {
3058 err = EPERM;
3059 goto error;
3061 if (bad_range(package->client, package->req)) {
3062 goto error;
3064 handle_trim(package->client, package->req);
3065 break;
3066 case NBD_CMD_WRITE_ZEROES:
3067 if (bad_write(package->client, package->req)) {
3068 err = EPERM;
3069 goto error;
3071 if (bad_range(package->client, package->req)) {
3072 err = ENOSPC;
3073 goto error;
3075 handle_write_zeroes(package->client, package->req);
3076 break;
3077 default:
3078 msg(LOG_ERR, "E: received unknown command %d of type, ignoring", package->req->type);
3079 goto error;
3081 goto end;
3082 error:
3083 setup_reply(&rep, package->req);
3084 rep.error = nbd_errno(err);
3085 log_reply(package->client, &rep);
3086 pthread_mutex_lock(&(package->client->lock));
3087 socket_write(package->client, &rep, sizeof rep);
3088 pthread_mutex_unlock(&(package->client->lock));
3089 end:
3090 package_dispose(package);
3093 static int mainloop_threaded(CLIENT* client) {
3094 struct nbd_request* req;
3095 struct work_package* pkg;
3096 int write_data = false;
3098 DEBUG("Entering request loop\n");
3099 while(1) {
3100 req = calloc(sizeof (struct nbd_request), 1);
3102 socket_read(client, req, sizeof(struct nbd_request));
3104 if(client->transactionlogfd != -1) {
3105 lock_logsem(client);
3106 writeit(client->transactionlogfd, req, sizeof(struct nbd_request));
3107 if(((ntohl(req->type) & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) &&
3108 (client->server->flags & F_DATALOG) &&
3109 !(client->server->flags & F_SPLICE)) {
3110 write_data = true;
3111 } else {
3112 write_data = false;
3113 unlock_logsem(client);
3117 req->from = ntohll(req->from);
3118 req->type = ntohl(req->type);
3119 req->len = ntohl(req->len);
3121 if(req->magic != htonl(NBD_REQUEST_MAGIC))
3122 err("Protocol error: not enough magic.");
3124 pkg = package_create(client, req);
3126 if((req->type & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) {
3127 #ifdef HAVE_SPLICE
3128 if ((client->server->flags & F_SPLICE) &&
3129 (req->len <= MAX_PIPE_SIZE && pkg->pipefd[1] > 0) &&
3130 (client->tls_session == NULL))
3131 spliceit(client->net, NULL, pkg->pipefd[1],
3132 NULL, req->len);
3133 else
3134 #endif
3135 socket_read(client, pkg->data, req->len);
3137 if (write_data) {
3138 writeit(client->transactionlogfd, pkg->data, req->len);
3139 unlock_logsem(client);
3140 write_data = false;
3143 if(req->type == NBD_CMD_DISC) {
3144 finalize_client(client);
3145 package_dispose(pkg);
3146 return 0;
3148 g_thread_pool_push(tpool, pkg, NULL);
3153 * Destroy a pid_t*
3154 * @param data a pointer to pid_t which should be freed
3156 void destroy_pid_t(gpointer data) {
3157 g_free(data);
3160 static pid_t spawn_child(int* socket) {
3161 pid_t pid;
3162 sigset_t newset;
3163 sigset_t oldset;
3164 int sockets[2];
3166 sigemptyset(&newset);
3167 sigaddset(&newset, SIGCHLD);
3168 sigaddset(&newset, SIGTERM);
3169 sigprocmask(SIG_BLOCK, &newset, &oldset);
3170 socketpair(AF_UNIX, SOCK_STREAM, 0, sockets);
3171 pid = fork();
3172 if (pid < 0) {
3173 msg(LOG_ERR, "Could not fork (%s)", strerror(errno));
3174 close(sockets[0]);
3175 close(sockets[1]);
3176 goto out;
3178 if (pid > 0) { /* Parent */
3179 pid_t *pidp;
3181 pidp = g_malloc(sizeof(pid_t));
3182 *pidp = pid;
3183 *socket = sockets[1];
3184 close(sockets[0]);
3185 g_hash_table_insert(children, pidp, pidp);
3186 goto out;
3188 /* Child */
3189 *socket = sockets[0];
3190 close(sockets[1]);
3191 /* Child's signal disposition is reset to default. */
3192 signal(SIGCHLD, SIG_DFL);
3193 signal(SIGTERM, SIG_DFL);
3194 signal(SIGHUP, SIG_DFL);
3195 sigemptyset(&oldset);
3196 out:
3197 sigprocmask(SIG_SETMASK, &oldset, NULL);
3198 return pid;
/**
 * Accept one pending connection on a listening socket.
 *
 * @param sock the listening socket
 * @return the descriptor of the accepted connection, or a negative value
 *         if accept() failed (the failure is reported as non-fatal)
 */
static int
socket_accept(const int sock)
{
	struct sockaddr_storage peer;
	socklen_t peerlen = sizeof(peer);
	const int conn = accept(sock, (struct sockaddr *) &peer, &peerlen);

	if (conn < 0) {
		err_nonfatal("Failed to accept socket connection: %m");
	}

	return conn;
}
3216 static void
3217 handle_modern_connection(GArray *const servers, const int sock, struct generic_conf *genconf)
3219 int net;
3220 pid_t pid;
3221 CLIENT *client = NULL;
3222 int sock_flags_old;
3223 int sock_flags_new;
3225 net = socket_accept(sock);
3226 if (net < 0)
3227 return;
3229 if (!dontfork) {
3230 pid = spawn_child(&commsocket);
3231 if (pid) {
3232 if (pid > 0) {
3233 msg(LOG_INFO, "Spawned a child process");
3234 g_array_append_val(childsocks, commsocket);
3236 if (pid < 0)
3237 msg(LOG_ERR, "Failed to spawn a child process");
3238 close(net);
3239 return;
3241 /* Child just continues. */
3243 tpool = g_thread_pool_new(handle_request, NULL, genconf->threads, FALSE, NULL);
3245 sock_flags_old = fcntl(net, F_GETFL, 0);
3246 if (sock_flags_old == -1) {
3247 msg(LOG_ERR, "Failed to get socket flags");
3248 goto handler_err;
3251 sock_flags_new = sock_flags_old & ~O_NONBLOCK;
3252 if (sock_flags_new != sock_flags_old &&
3253 fcntl(net, F_SETFL, sock_flags_new) == -1) {
3254 msg(LOG_ERR, "Failed to set socket to blocking mode");
3255 goto handler_err;
3258 client = negotiate(net, servers, genconf);
3259 if (!client) {
3260 msg(LOG_ERR, "Modern initial negotiation failed");
3261 goto handler_err;
3264 if (!dontfork) {
3265 int i;
3267 /* Free all root server resources here, because we are
3268 * currently in the child process serving one specific
3269 * connection. These are not simply needed anymore. */
3270 g_hash_table_destroy(children);
3271 children = NULL;
3272 for (i = 0; i < modernsocks->len; i++) {
3273 close(g_array_index(modernsocks, int, i));
3275 g_array_free(modernsocks, TRUE);
3277 /* Now that we are in the child process after a
3278 * succesful negotiation, we do not need the list of
3279 * servers anymore, get rid of it.*/
3280 g_array_free(servers, FALSE);
3283 msg(LOG_INFO, "Starting to serve");
3284 mainloop_threaded(client);
3285 exit(EXIT_SUCCESS);
3287 handler_err:
3288 close(net);
3289 g_free(client);
3291 if (!dontfork) {
3292 exit(EXIT_FAILURE);
3296 static int handle_childname(GArray* servers, int socket)
3298 uint32_t len;
3299 _cleanup_g_free_ char *buf = NULL;
3300 int i, r, rt = 0;
3302 while(rt < sizeof(len)) {
3303 switch((r = read(socket, &len, sizeof len))) {
3304 case 0:
3305 return -1;
3306 case -1:
3307 err_nonfatal("Error reading from acl socket: %m");
3308 return -1;
3309 default:
3310 rt += r;
3311 break;
3314 if (len >= UINT32_MAX - 1) {
3315 err_nonfatal("Value out of range");
3316 return -1;
3318 buf = g_malloc0(len + 1);
3319 readit(socket, buf, len);
3320 buf[len] = 0;
3321 for(i=0; i<servers->len; i++) {
3322 SERVER* srv = g_array_index(servers, SERVER*, i);
3323 if(strcmp(srv->servename, buf) == 0) {
3324 if(srv->max_connections == 0 || srv->max_connections > srv->numclients) {
3325 writeit(socket, "Y", 1);
3326 srv->numclients++;
3327 } else {
3328 writeit(socket, "N", 1);
3330 goto exit;
3333 writeit(socket, "X", 1);
3334 exit:
3335 return 0;
3339 * Return the index of the server whose servename matches the given
3340 * name.
3342 * @param servename a string to match
3343 * @param servers an array of servers
3344 * @return the first index of the server whose servename matches the
3345 * given name or -1 if one cannot be found
3347 static int get_index_by_servename(const gchar *const servename,
3348 const GArray *const servers) {
3349 int i;
3351 for (i = 0; i < servers->len; ++i) {
3352 const SERVER* server = g_array_index(servers, SERVER*, i);
3354 if (strcmp(servename, server->servename) == 0)
3355 return i;
3358 return -1;
3362 * Parse configuration files and add servers to the array if they don't
3363 * already exist there. The existence is tested by comparing
3364 * servenames. A server is appended to the array only if its servename
3365 * is unique among all other servers.
3367 * @param servers an array of servers
3368 * @param genconf a pointer to generic configuration
3369 * @return the number of new servers appended to the array, or -1 in
3370 * case of an error
3372 static int append_new_servers(GArray *const servers, struct generic_conf *genconf, GError **const gerror) {
3373 int i;
3374 GArray *new_servers;
3375 const int old_len = servers->len;
3376 int retval = -1;
3378 new_servers = parse_cfile(config_file_pos, genconf, true, gerror);
3379 if(tpool) g_thread_pool_set_max_threads(tpool, genconf->threads, NULL);
3380 if(!new_servers)
3381 goto out;
3383 for(i = 0; i < new_servers->len; ++i) {
3384 SERVER *new_server = g_array_index(new_servers, SERVER*, i);
3386 if (new_server->servename
3387 && -1 == get_index_by_servename(new_server->servename,
3388 servers)) {
3389 serve_inc_ref(new_server);
3390 g_array_append_val(servers, new_server);
3394 retval = servers->len - old_len;
3395 out:
3396 g_array_free(new_servers, TRUE);
3398 return retval;
3401 void serveloop(GArray* servers, struct generic_conf *genconf) G_GNUC_NORETURN;
3403 * Loop through the available servers, and serve them. Never returns.
3405 void serveloop(GArray* servers, struct generic_conf *genconf) {
3406 int i;
3407 int mmax, max;
3408 fd_set mset;
3409 fd_set rset;
3410 sigset_t blocking_mask;
3411 sigset_t original_mask;
3414 * Set up the master fd_set. The set of descriptors we need
3415 * to select() for never changes anyway and it buys us a *lot*
3416 * of time to only build this once. However, if we ever choose
3417 * to not fork() for clients anymore, we may have to revisit
3418 * this.
3420 mmax=0;
3421 FD_ZERO(&mset);
3422 for(i=0;i<modernsocks->len;i++) {
3423 int sock = g_array_index(modernsocks, int, i);
3424 FD_SET(sock, &mset);
3425 mmax=sock>mmax?sock:mmax;
3428 /* Construct a signal mask which is used to make signal testing and
3429 * receiving an atomic operation to ensure no signal is received between
3430 * tests and blocking pselect(). */
3431 if (sigemptyset(&blocking_mask) == -1)
3432 err("failed to initialize blocking_mask: %m");
3434 if (sigaddset(&blocking_mask, SIGCHLD) == -1)
3435 err("failed to add SIGCHLD to blocking_mask: %m");
3437 if (sigaddset(&blocking_mask, SIGHUP) == -1)
3438 err("failed to add SIGHUP to blocking_mask: %m");
3440 if (sigaddset(&blocking_mask, SIGTERM) == -1)
3441 err("failed to add SIGTERM to blocking_mask: %m");
3443 if (sigprocmask(SIG_BLOCK, &blocking_mask, &original_mask) == -1)
3444 err("failed to block signals: %m");
3446 for(;;) {
3447 if (is_sigterm_caught) {
3448 is_sigterm_caught = 0;
3450 g_hash_table_foreach(children, killchild, NULL);
3451 unlink(pidfname);
3453 exit(EXIT_SUCCESS);
3456 if (is_sigchld_caught) {
3457 int status;
3458 int* i;
3459 pid_t pid;
3461 is_sigchld_caught = 0;
3463 while ((pid=waitpid(-1, &status, WNOHANG)) > 0) {
3464 if (WIFEXITED(status)) {
3465 msg(LOG_INFO, "Child exited with %d", WEXITSTATUS(status));
3467 i = g_hash_table_lookup(children, &pid);
3468 if (!i) {
3469 msg(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid);
3470 } else {
3471 DEBUG("Removing %d from the list of children", pid);
3472 g_hash_table_remove(children, &pid);
3477 /* SIGHUP causes the root server process to reconfigure
3478 * itself and add new export servers for each newly
3479 * found export configuration group, i.e. spawn new
3480 * server processes for each previously non-existent
3481 * export. This does not alter old runtime configuration
3482 * but just appends new exports. */
3483 if (is_sighup_caught) {
3484 int n;
3485 GError *gerror = NULL;
3487 msg(LOG_INFO, "reconfiguration request received");
3488 is_sighup_caught = 0; /* Reset to allow catching
3489 * it again. */
3491 n = append_new_servers(servers, genconf, &gerror);
3492 if (n == -1)
3493 msg(LOG_ERR, "failed to append new servers: %s",
3494 gerror->message);
3496 for (i = servers->len - n; i < servers->len; ++i) {
3497 const SERVER *server = g_array_index(servers,
3498 SERVER*, i);
3500 msg(LOG_INFO, "reconfigured new server: %s",
3501 server->servename);
3505 memcpy(&rset, &mset, sizeof(fd_set));
3506 max=mmax;
3507 for(i=0;i<childsocks->len;i++) {
3508 int sock = g_array_index(childsocks, int, i);
3509 FD_SET(sock, &rset);
3510 max=sock>max?sock:max;
3513 if (pselect(max + 1, &rset, NULL, NULL, NULL, &original_mask) > 0) {
3514 DEBUG("accept, ");
3515 for(i=0; i < modernsocks->len; i++) {
3516 int sock = g_array_index(modernsocks, int, i);
3517 if(!FD_ISSET(sock, &rset)) {
3518 continue;
3521 handle_modern_connection(servers, sock, genconf);
3523 for(i=0; i < childsocks->len; i++) {
3524 int sock = g_array_index(childsocks, int, i);
3526 if(FD_ISSET(sock, &rset)) {
3527 if(handle_childname(servers, sock) < 0) {
3528 close(sock);
3529 g_array_remove_index(childsocks, i);
3538 * Set server socket options.
3540 * @param socket a socket descriptor of the server
3542 * @param gerror a pointer to an error object pointer used for reporting
3543 * errors. On error, if gerror is not NULL, *gerror is set and -1
3544 * is returned.
3546 * @return 0 on success, -1 on error
3548 int dosockopts(const int socket, GError **const gerror) {
3549 #ifndef sun
3550 int yes=1;
3551 #else
3552 char yes='1';
3553 #endif /* sun */
3554 struct linger l;
3556 /* lose the pesky "Address already in use" error message */
3557 if (setsockopt(socket,SOL_SOCKET,SO_REUSEADDR,&yes,sizeof(int)) == -1) {
3558 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SO_REUSEADDR,
3559 "failed to set socket option SO_REUSEADDR: %s",
3560 strerror(errno));
3561 return -1;
3563 l.l_onoff = 1;
3564 l.l_linger = 10;
3565 if (setsockopt(socket,SOL_SOCKET,SO_LINGER,&l,sizeof(l)) == -1) {
3566 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SO_LINGER,
3567 "failed to set socket option SO_LINGER: %s",
3568 strerror(errno));
3569 return -1;
3571 if (setsockopt(socket,SOL_SOCKET,SO_KEEPALIVE,&yes,sizeof(int)) == -1) {
3572 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SO_KEEPALIVE,
3573 "failed to set socket option SO_KEEPALIVE: %s",
3574 strerror(errno));
3575 return -1;
3578 return 0;
3581 int open_unix(const gchar *const sockname, GError **const gerror) {
3582 struct sockaddr_un sa;
3583 int sock=-1;
3584 int retval=-1;
3586 memset(&sa, 0, sizeof(struct sockaddr_un));
3587 sa.sun_family = AF_UNIX;
3588 strncpy(sa.sun_path, sockname, sizeof sa.sun_path);
3589 sa.sun_path[sizeof(sa.sun_path)-1] = '\0';
3590 sock = socket(AF_UNIX, SOCK_STREAM, 0);
3591 if(sock < 0) {
3592 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SOCKET,
3593 "failed to open a unix socket: "
3594 "failed to create socket: %s",
3595 strerror(errno));
3596 goto out;
3598 if(bind(sock, (struct sockaddr*)&sa, sizeof(struct sockaddr_un))<0) {
3599 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3600 "failed to open a unix socket: "
3601 "failed to bind to address %s: %s",
3602 sockname, strerror(errno));
3603 goto out;
3605 if(listen(sock, 10)<0) {
3606 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3607 "failed to open a unix socket: "
3608 "failed to start listening: %s",
3609 strerror(errno));
3610 goto out;
3612 retval=0;
3613 g_array_append_val(modernsocks, sock);
3614 out:
3615 if(retval<0 && sock >= 0) {
3616 close(sock);
3619 return retval;
3622 int open_modern(const gchar *const addr, const gchar *const port,
3623 GError **const gerror) {
3624 struct addrinfo hints;
3625 struct addrinfo* ai = NULL;
3626 struct addrinfo* ai_bak = NULL;
3627 struct sock_flags;
3628 int e;
3629 int retval = -1;
3630 int sock = -1;
3631 _cleanup_(g_strfreevp) gchar** addrs;
3632 gchar const* l_addr = addr;
3634 if(!addr || strlen(addr) == 0) {
3635 l_addr = "::, 0.0.0.0";
3638 addrs = g_strsplit_set(l_addr, ", \t", -1);
3640 for(int i=0; addrs[i]!=NULL; i++) {
3641 if(addrs[i][0] == '\0') {
3642 continue;
3644 memset(&hints, '\0', sizeof(hints));
3645 hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG;
3646 hints.ai_socktype = SOCK_STREAM;
3647 hints.ai_family = AF_UNSPEC;
3648 hints.ai_protocol = IPPROTO_TCP;
3649 e = getaddrinfo(addrs[i], port ? port : NBD_DEFAULT_PORT, &hints, &ai);
3650 ai_bak = ai;
3651 if(e != 0 && addrs[i+1] == NULL && modernsocks->len == 0) {
3652 g_set_error(gerror, NBDS_ERR, NBDS_ERR_GAI,
3653 "failed to open a modern socket: "
3654 "failed to get address info: %s",
3655 gai_strerror(e));
3656 goto out;
3659 while(ai != NULL) {
3660 sock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
3661 if(sock<0) {
3662 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SOCKET,
3663 "failed to open a modern socket: "
3664 "failed to create a socket: %s",
3665 strerror(errno));
3666 goto out;
3669 if (dosockopts(sock, gerror) == -1) {
3670 g_prefix_error(gerror, "failed to open a modern socket: ");
3671 goto out;
3674 if(bind(sock, ai->ai_addr, ai->ai_addrlen)) {
3676 * Some systems will return multiple entries for the
3677 * same address when we ask it for something
3678 * AF_UNSPEC, even though the first entry will
3679 * listen to both protocols. Other systems will
3680 * return multiple entries too, but we actually
3681 * do need to open both.
3683 * Handle this by ignoring EADDRINUSE if we've
3684 * already got at least one socket open
3686 if(errno == EADDRINUSE && modernsocks->len > 0) {
3687 goto next;
3689 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3690 "failed to open a modern socket: "
3691 "failed to bind an address to a socket: %s",
3692 strerror(errno));
3693 goto out;
3696 if(listen(sock, 10) <0) {
3697 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3698 "failed to open a modern socket: "
3699 "failed to start listening on a socket: %s",
3700 strerror(errno));
3701 goto out;
3703 g_array_append_val(modernsocks, sock);
3704 next:
3705 ai = ai->ai_next;
3707 if(ai_bak) {
3708 freeaddrinfo(ai_bak);
3709 ai_bak=NULL;
3713 retval = 0;
3714 out:
3716 if (retval == -1 && sock >= 0) {
3717 close(sock);
3719 if(ai_bak)
3720 freeaddrinfo(ai_bak);
3722 return retval;
3726 * Connect our servers.
3728 void setup_servers(GArray *const servers, const gchar *const modernaddr,
3729 const gchar *const modernport, const gchar* unixsock,
3730 const gint flags ) {
3731 struct sigaction sa;
3733 if(unixsock != NULL) {
3734 GError* gerror = NULL;
3735 if(open_unix(unixsock, &gerror) == -1) {
3736 msg(LOG_ERR, "failed to setup servers: %s",
3737 gerror->message);
3738 g_clear_error(&gerror);
3739 exit(EXIT_FAILURE);
3742 if (((flags & F_DUAL_LISTEN) != 0) || (unixsock == NULL)) {
3743 GError *gerror = NULL;
3744 if (open_modern(modernaddr, modernport, &gerror) == -1) {
3745 msg(LOG_ERR, "failed to setup servers: %s",
3746 gerror->message);
3747 g_clear_error(&gerror);
3748 exit(EXIT_FAILURE);
3751 children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t);
3753 sa.sa_handler = sigchld_handler;
3754 sigemptyset(&sa.sa_mask);
3755 sigaddset(&sa.sa_mask, SIGTERM);
3756 sa.sa_flags = SA_RESTART;
3757 if(sigaction(SIGCHLD, &sa, NULL) == -1)
3758 err("sigaction: %m");
3760 sa.sa_handler = sigterm_handler;
3761 sigemptyset(&sa.sa_mask);
3762 sigaddset(&sa.sa_mask, SIGCHLD);
3763 sa.sa_flags = SA_RESTART;
3764 if(sigaction(SIGTERM, &sa, NULL) == -1)
3765 err("sigaction: %m");
3767 sa.sa_handler = sighup_handler;
3768 sigemptyset(&sa.sa_mask);
3769 sa.sa_flags = SA_RESTART;
3770 if(sigaction(SIGHUP, &sa, NULL) == -1)
3771 err("sigaction: %m");
3773 sa.sa_handler = sigusr1_handler;
3774 sigemptyset(&sa.sa_mask);
3775 sa.sa_flags = SA_RESTART;
3776 if(sigaction(SIGUSR1, &sa, NULL) == -1)
3777 err("sigaction: %m");
3781 * Go daemon (unless we specified at compile time that we didn't want this)
3783 #if !defined(NODAEMON)
3784 void daemonize() {
3785 pid_t child=fork();
3786 if(child < 0) {
3787 err("fork");
3788 } else if(child > 0) {
3789 exit(EXIT_SUCCESS);
3790 } else {
3791 if(setsid() < 0) {
3792 err("setsid");
3795 if(chdir("/")<0) {
3796 err("chdir");
3798 if(!*pidfname) {
3799 strncpy(pidfname, "/var/run/nbd-server.pid", 255);
3801 int newfd;
3802 if((newfd = open("/dev/null", O_RDWR)) < 0) {
3803 err("open");
3805 if(dup2(0, newfd) < 0) {
3806 err("dup2 stdin");
3808 if(dup2(1, newfd) < 0) {
3809 err("dup2 stdout");
3811 if(dup2(2, newfd) < 0) {
3812 err("dup2 stderr");
3814 child=fork();
3815 if(child < 0) {
3816 err("fork");
3817 } else if(child > 0) {
3818 exit(EXIT_SUCCESS);
3820 FILE*pidf=fopen(pidfname, "w");
3821 if(pidf) {
3822 fprintf(pidf,"%d\n", (int)getpid());
3823 fclose(pidf);
3824 } else {
3825 perror("fopen");
3826 fprintf(stderr, "Not fatal; continuing");
3829 #else
3830 #define daemonize()
3831 #endif /* !defined(NODAEMON) */
3834 * Everything beyond this point (in the file) is run in non-daemon mode.
3835 * The stuff above daemonize() isn't.
3839 * Set up user-ID and/or group-ID
3841 void dousers(const gchar *const username, const gchar *const groupname) {
3842 struct passwd *pw;
3843 struct group *gr;
3844 gchar* str;
3845 if (groupname) {
3846 gr = getgrnam(groupname);
3847 if(!gr) {
3848 str = g_strdup_printf("Invalid group name: %s", groupname);
3849 err(str);
3851 if(setgid(gr->gr_gid)<0) {
3852 err("Could not set GID: %m");
3855 if (username) {
3856 pw = getpwnam(username);
3857 if(!pw) {
3858 str = g_strdup_printf("Invalid user name: %s", username);
3859 err(str);
3861 if (setgroups(0, NULL)<0) {
3862 err("Could not set groups: %m");
3864 if(setuid(pw->pw_uid)<0) {
3865 err("Could not set UID: %m");
3870 #ifndef ISSERVER
3871 void glib_message_syslog_redirect(const gchar *log_domain,
3872 GLogLevelFlags log_level,
3873 const gchar *message,
3874 gpointer user_data)
3876 int level=LOG_DEBUG;
3878 switch( log_level )
3880 case G_LOG_FLAG_FATAL:
3881 case G_LOG_LEVEL_CRITICAL:
3882 case G_LOG_LEVEL_ERROR:
3883 level=LOG_ERR;
3884 break;
3885 case G_LOG_LEVEL_WARNING:
3886 level=LOG_WARNING;
3887 break;
3888 case G_LOG_LEVEL_MESSAGE:
3889 case G_LOG_LEVEL_INFO:
3890 level=LOG_INFO;
3891 break;
3892 case G_LOG_LEVEL_DEBUG:
3893 level=LOG_DEBUG;
3894 break;
3895 default:
3896 level=LOG_ERR;
3898 syslog(level, "%s", message);
3900 #endif
3903 * Main entry point...
3905 int main(int argc, char *argv[]) {
3906 SERVER *serve;
3907 GArray *servers;
3908 GError *gerr=NULL;
3909 struct generic_conf genconf;
3911 memset(&genconf, 0, sizeof(struct generic_conf));
3913 if (sizeof( struct nbd_request )!=28) {
3914 fprintf(stderr,"Bad size of structure. Alignment problems?\n");
3915 exit(EXIT_FAILURE) ;
3918 modernsocks = g_array_new(FALSE, FALSE, sizeof(int));
3919 childsocks = g_array_new(FALSE, FALSE, sizeof(int));
3921 logging(MY_NAME);
3922 config_file_pos = g_strdup(CFILE);
3923 serve=cmdline(argc, argv, &genconf);
3925 genconf.threads = 4;
3926 servers = parse_cfile(config_file_pos, &genconf, true, &gerr);
3928 /* Update global variables with parsed values. This will be
3929 * removed once we get rid of global configuration variables. */
3930 glob_flags |= genconf.flags;
3932 if(serve) {
3933 g_array_append_val(servers, serve);
3936 if(!servers || !servers->len) {
3937 if(gerr && !(gerr->domain == NBDS_ERR
3938 && gerr->code == NBDS_ERR_CFILE_NOTFOUND)) {
3939 g_warning("Could not parse config file: %s", gerr->message);
3942 if(serve) {
3943 g_warning("Specifying an export on the command line no longer uses the oldstyle protocol.");
3946 if((!serve) && (!servers||!servers->len)) {
3947 if(gerr)
3948 g_message("No configured exports; quitting.");
3949 exit(EXIT_FAILURE);
3951 if (!nodaemon)
3952 daemonize();
3954 setup_servers(servers, genconf.modernaddr, genconf.modernport,
3955 genconf.unixsock, genconf.flags);
3956 dousers(genconf.user, genconf.group);
3958 #if HAVE_GNUTLS
3959 gnutls_global_init();
3960 static gnutls_dh_params_t dh_params;
3961 gnutls_dh_params_init(&dh_params);
3962 gnutls_dh_params_generate2(dh_params,
3963 gnutls_sec_param_to_pk_bits(GNUTLS_PK_DH,
3964 // Renamed in GnuTLS 3.3
3965 #if GNUTLS_VERSION_NUMBER >= 0x030300
3966 GNUTLS_SEC_PARAM_MEDIUM
3967 #else
3968 GNUTLS_SEC_PARAM_NORMAL
3969 #endif
3971 #endif
3973 if((genconf.modernport != NULL) && strcmp(genconf.modernport, "0")==0) {
3974 #ifndef ISSERVER
3975 err("inetd mode requires syslog");
3976 #endif
3977 CLIENT* client = negotiate(0, servers, &genconf);
3978 if(!client) {
3979 exit(EXIT_FAILURE);
3981 tpool = g_thread_pool_new(handle_request, NULL, genconf.threads, FALSE, NULL);
3982 mainloop_threaded(client);
3983 return 0;
3986 serveloop(servers, &genconf);