Remove building with NOCRYPTO option
[minix.git] / external / bsd / bind / dist / lib / isc / win32 / socket.c
blob1af018fc1b990b6e77c34f3560e754285d23b855
1 /* $NetBSD: socket.c,v 1.10 2015/07/08 17:29:00 christos Exp $ */
3 /*
4 * Copyright (C) 2004-2015 Internet Systems Consortium, Inc. ("ISC")
5 * Copyright (C) 2000-2003 Internet Software Consortium.
7 * Permission to use, copy, modify, and/or distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
11 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
12 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
13 * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
14 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
15 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
16 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
17 * PERFORMANCE OF THIS SOFTWARE.
20 /* Id */
22 /* This code uses functions which are only available on Server 2003 and
23 * higher, and Windows XP and higher.
25 * This code is by nature multithreaded and takes advantage of various
26 * features to pass on information through the completion port for
27 * when I/O is completed. All sends, receives, accepts, and connects are
28 * completed through the completion port.
30 * The number of Completion Port Worker threads used is the total number
31 * of CPU's + 1. This increases the likelihood that a Worker Thread is
32 * available for processing a completed request.
34 * XXXPDM 5 August, 2002
37 #define MAKE_EXTERNAL 1
38 #include <config.h>
40 #include <sys/types.h>
42 #ifndef _WINSOCKAPI_
43 #define _WINSOCKAPI_ /* Prevent inclusion of winsock.h in windows.h */
44 #endif
46 #include <errno.h>
47 #include <stddef.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <unistd.h>
51 #include <io.h>
52 #include <fcntl.h>
53 #include <process.h>
55 #include <isc/app.h>
56 #include <isc/buffer.h>
57 #include <isc/bufferlist.h>
58 #include <isc/condition.h>
59 #include <isc/list.h>
60 #include <isc/log.h>
61 #include <isc/mem.h>
62 #include <isc/msgs.h>
63 #include <isc/mutex.h>
64 #include <isc/net.h>
65 #include <isc/once.h>
66 #include <isc/os.h>
67 #include <isc/platform.h>
68 #include <isc/print.h>
69 #include <isc/region.h>
70 #include <isc/socket.h>
71 #include <isc/stats.h>
72 #include <isc/strerror.h>
73 #include <isc/syslog.h>
74 #include <isc/task.h>
75 #include <isc/thread.h>
76 #include <isc/util.h>
77 #include <isc/win32os.h>
79 #include <mswsock.h>
81 #include "errno2result.h"
84 * Set by the -T dscp option on the command line. If set to a value
85 * other than -1, we check to make sure DSCP values match it, and
86 * assert if not.
88 int isc_dscp_check_value = -1;
91 * How in the world can Microsoft exist with APIs like this?
92 * We can't actually call this directly, because it turns out
93 * no library exports this function. Instead, we need to
94 * issue a runtime call to get the address.
96 LPFN_CONNECTEX ISCConnectEx;
97 LPFN_ACCEPTEX ISCAcceptEx;
98 LPFN_GETACCEPTEXSOCKADDRS ISCGetAcceptExSockaddrs;
101 * Run expensive internal consistency checks.
103 #ifdef ISC_SOCKET_CONSISTENCY_CHECKS
104 #define CONSISTENT(sock) consistent(sock)
105 #else
106 #define CONSISTENT(sock) do {} while (/*CONSTCOND*/0)
107 #endif
108 static void consistent(isc_socket_t *sock);
111 * Define this macro to control the behavior of connection
112 * resets on UDP sockets. See Microsoft KnowledgeBase Article Q263823
113 * for details.
114 * NOTE: This requires that Windows 2000 systems install Service Pack 2
115 * or later.
117 #ifndef SIO_UDP_CONNRESET
118 #define SIO_UDP_CONNRESET _WSAIOW(IOC_VENDOR,12)
119 #endif
122 * Some systems define the socket length argument as an int, some as size_t,
123 * some as socklen_t. This is here so it can be easily changed if needed.
125 #ifndef ISC_SOCKADDR_LEN_T
126 #define ISC_SOCKADDR_LEN_T unsigned int
127 #endif
130 * Define what the possible "soft" errors can be. These are non-fatal returns
131 * of various network related functions, like recv() and so on.
133 #define SOFT_ERROR(e) ((e) == WSAEINTR || \
134 (e) == WSAEWOULDBLOCK || \
135 (e) == EWOULDBLOCK || \
136 (e) == EINTR || \
137 (e) == EAGAIN || \
138 (e) == 0)
141 * Pending errors are not really errors and should be
142 * kept separate
144 #define PENDING_ERROR(e) ((e) == WSA_IO_PENDING || (e) == 0)
146 #define DOIO_SUCCESS 0 /* i/o ok, event sent */
147 #define DOIO_SOFT 1 /* i/o ok, soft error, no event sent */
148 #define DOIO_HARD 2 /* i/o error, event sent */
149 #define DOIO_EOF 3 /* EOF, no event sent */
150 #define DOIO_PENDING 4 /* status when i/o is in process */
151 #define DOIO_NEEDMORE 5 /* IO was processed, but we need more due to minimum */
153 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
156 * DLVL(90) -- Function entry/exit and other tracing.
157 * DLVL(70) -- Socket "correctness" -- including returning of events, etc.
158 * DLVL(60) -- Socket data send/receive
159 * DLVL(50) -- Event tracing, including receiving/sending completion events.
160 * DLVL(20) -- Socket creation/destruction.
162 #define TRACE_LEVEL 90
163 #define CORRECTNESS_LEVEL 70
164 #define IOEVENT_LEVEL 60
165 #define EVENT_LEVEL 50
166 #define CREATION_LEVEL 20
168 #define TRACE DLVL(TRACE_LEVEL)
169 #define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
170 #define IOEVENT DLVL(IOEVENT_LEVEL)
171 #define EVENT DLVL(EVENT_LEVEL)
172 #define CREATION DLVL(CREATION_LEVEL)
174 typedef isc_event_t intev_t;
/*
 * Socket state.  The current value is kept in isc_socket.state and is
 * updated through the _set_state() macro.
 */
enum {
	SOCK_INITIALIZED,	/* socket initialized */
	SOCK_OPEN,		/* socket opened, nothing to do yet */
	SOCK_DATA,		/* socket sending or receiving data */
	SOCK_LISTEN,		/* TCP socket listening for connects */
	SOCK_ACCEPT,		/* TCP socket waiting to accept */
	SOCK_CONNECT,		/* TCP socket connecting */
	SOCK_CLOSED		/* socket has been closed */
};
189 #define SOCKET_MAGIC ISC_MAGIC('I', 'O', 'i', 'o')
190 #define VALID_SOCKET(t) ISC_MAGIC_VALID(t, SOCKET_MAGIC)
193 * IPv6 control information. If the socket is an IPv6 socket we want
194 * to collect the destination address and interface so the client can
195 * set them on outgoing packets.
197 #ifdef ISC_PLATFORM_HAVEIPV6
198 #ifndef USE_CMSG
199 #define USE_CMSG 1
200 #endif
201 #endif
204 * We really don't want to try and use these control messages. Win32
205 * doesn't have this mechanism before XP.
207 #undef USE_CMSG
210 * Message header for recvmsg and sendmsg calls.
211 * Used value-result for recvmsg, value only for sendmsg.
213 struct msghdr {
214 SOCKADDR_STORAGE to_addr; /* UDP send/recv address */
215 int to_addr_len; /* length of the address */
216 WSABUF *msg_iov; /* scatter/gather array */
217 u_int msg_iovlen; /* # elements in msg_iov */
218 void *msg_control; /* ancillary data, see below */
219 u_int msg_controllen; /* ancillary data buffer len */
220 u_int msg_totallen; /* total length of this message */
221 } msghdr;
224 * The size to raise the receive buffer to.
226 #define RCVBUFSIZE (32*1024)
229 * The number of times a send operation is repeated if the result
230 * is WSAEINTR.
232 #define NRETRIES 10
234 struct isc_socket {
235 /* Not locked. */
236 unsigned int magic;
237 isc_socketmgr_t *manager;
238 isc_mutex_t lock;
239 isc_sockettype_t type;
241 /* Pointers to scatter/gather buffers */
242 WSABUF iov[ISC_SOCKET_MAXSCATTERGATHER];
244 /* Locked by socket lock. */
245 ISC_LINK(isc_socket_t) link;
246 unsigned int references; /* EXTERNAL references */
247 SOCKET fd; /* file handle */
248 int pf; /* protocol family */
249 char name[16];
250 void * tag;
253 * Each recv() call uses this buffer. It is a per-socket receive
254 * buffer that allows us to decouple the system recv() from the
255 * recv_list done events. This means the items on the recv_list
256 * can be removed without having to cancel pending system recv()
257 * calls. It also allows us to read-ahead in some cases.
259 struct {
260 SOCKADDR_STORAGE from_addr; // UDP send/recv address
261 int from_addr_len; // length of the address
262 char *base; // the base of the buffer
263 char *consume_position; // where to start copying data from next
264 unsigned int len; // the actual size of this buffer
265 unsigned int remaining; // the number of bytes remaining
266 } recvbuf;
268 ISC_LIST(isc_socketevent_t) send_list;
269 ISC_LIST(isc_socketevent_t) recv_list;
270 ISC_LIST(isc_socket_newconnev_t) accept_list;
271 isc_socket_connev_t *connect_ev;
273 isc_sockaddr_t address; /* remote address */
275 unsigned int listener : 1, /* listener socket */
276 connected : 1,
277 pending_connect : 1, /* connect pending */
278 bound : 1, /* bound to local addr */
279 dupped : 1; /* created by isc_socket_dup() */
280 unsigned int pending_iocp; /* Should equal the counters below. Debug. */
281 unsigned int pending_recv; /* Number of outstanding recv() calls. */
282 unsigned int pending_send; /* Number of outstanding send() calls. */
283 unsigned int pending_accept; /* Number of outstanding accept() calls. */
284 unsigned int state; /* Socket state. Debugging and consistency checking. */
285 int state_lineno; /* line which last touched state */
288 #define _set_state(sock, _state) do { (sock)->state = (_state); (sock)->state_lineno = __LINE__; } while (/*CONSTCOND*/0)
291 * Buffer structure
293 typedef struct buflist buflist_t;
295 struct buflist {
296 void *buf;
297 unsigned int buflen;
298 ISC_LINK(buflist_t) link;
302 * I/O Completion ports Info structures
305 static HANDLE hHeapHandle = NULL;
306 typedef struct IoCompletionInfo {
307 OVERLAPPED overlapped;
308 isc_socketevent_t *dev; /* send()/recv() done event */
309 isc_socket_connev_t *cdev; /* connect() done event */
310 isc_socket_newconnev_t *adev; /* accept() done event */
311 void *acceptbuffer;
312 DWORD received_bytes;
313 int request_type;
314 struct msghdr messagehdr;
315 ISC_LIST(buflist_t) bufferlist; /*%< list of buffers */
316 } IoCompletionInfo;
319 * Define a maximum number of I/O Completion Port worker threads
320 * to handle the load on the Completion Port. The actual number
321 * used is the number of CPU's + 1.
323 #define MAX_IOCPTHREADS 20
325 #define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
326 #define VALID_MANAGER(m) ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
328 struct isc_socketmgr {
329 /* Not locked. */
330 unsigned int magic;
331 isc_mem_t *mctx;
332 isc_mutex_t lock;
333 isc_stats_t *stats;
335 /* Locked by manager lock. */
336 ISC_LIST(isc_socket_t) socklist;
337 isc_boolean_t bShutdown;
338 isc_condition_t shutdown_ok;
339 HANDLE hIoCompletionPort;
340 int maxIOCPThreads;
341 HANDLE hIOCPThreads[MAX_IOCPTHREADS];
342 DWORD dwIOCPThreadIds[MAX_IOCPTHREADS];
345 * Debugging.
346 * Modified by InterlockedIncrement() and InterlockedDecrement()
348 LONG totalSockets;
349 LONG iocp_total;
/* Values stored in IoCompletionInfo.request_type. */
enum {
	SOCKET_RECV,
	SOCKET_SEND,
	SOCKET_ACCEPT,
	SOCKET_CONNECT
};
360 * send() and recv() iovec counts
362 #define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
363 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
365 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
366 isc_sockettype_t type,
367 isc_socket_t **socketp,
368 isc_socket_t *dup_socket);
369 static isc_threadresult_t WINAPI SocketIoThread(LPVOID ThreadContext);
370 static void maybe_free_socket(isc_socket_t **, int);
371 static void free_socket(isc_socket_t **, int);
372 static isc_boolean_t senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev);
373 static isc_boolean_t acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev);
374 static isc_boolean_t connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev);
375 static void send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev);
376 static void send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev);
377 static void send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev);
378 static void send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev);
379 static void send_recvdone_abort(isc_socket_t *sock, isc_result_t result);
380 static void queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev);
381 static void queue_receive_request(isc_socket_t *sock);
384 * This is used to dump the contents of the sock structure
385 * You should make sure that the sock is locked before
386 * dumping it. Since the code uses simple printf() statements
387 * it should only be used interactively.
389 void
390 sock_dump(isc_socket_t *sock) {
391 isc_socketevent_t *ldev;
392 isc_socket_newconnev_t *ndev;
394 #if 0
395 isc_sockaddr_t addr;
396 char socktext[ISC_SOCKADDR_FORMATSIZE];
397 isc_result_t result;
399 result = isc_socket_getpeername(sock, &addr);
400 if (result == ISC_R_SUCCESS) {
401 isc_sockaddr_format(&addr, socktext, sizeof(socktext));
402 printf("Remote Socket: %s\n", socktext);
404 result = isc_socket_getsockname(sock, &addr);
405 if (result == ISC_R_SUCCESS) {
406 isc_sockaddr_format(&addr, socktext, sizeof(socktext));
407 printf("This Socket: %s\n", socktext);
409 #endif
411 printf("\n\t\tSock Dump\n");
412 printf("\t\tfd: %u\n", sock->fd);
413 printf("\t\treferences: %d\n", sock->references);
414 printf("\t\tpending_accept: %d\n", sock->pending_accept);
415 printf("\t\tconnecting: %d\n", sock->pending_connect);
416 printf("\t\tconnected: %d\n", sock->connected);
417 printf("\t\tbound: %d\n", sock->bound);
418 printf("\t\tpending_iocp: %d\n", sock->pending_iocp);
419 printf("\t\tsocket type: %d\n", sock->type);
421 printf("\n\t\tSock Recv List\n");
422 ldev = ISC_LIST_HEAD(sock->recv_list);
423 while (ldev != NULL) {
424 printf("\t\tdev: %p\n", ldev);
425 ldev = ISC_LIST_NEXT(ldev, ev_link);
428 printf("\n\t\tSock Send List\n");
429 ldev = ISC_LIST_HEAD(sock->send_list);
430 while (ldev != NULL) {
431 printf("\t\tdev: %p\n", ldev);
432 ldev = ISC_LIST_NEXT(ldev, ev_link);
435 printf("\n\t\tSock Accept List\n");
436 ndev = ISC_LIST_HEAD(sock->accept_list);
437 while (ndev != NULL) {
438 printf("\t\tdev: %p\n", ldev);
439 ndev = ISC_LIST_NEXT(ndev, ev_link);
443 static void
444 socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
445 isc_logcategory_t *category, isc_logmodule_t *module, int level,
446 isc_msgcat_t *msgcat, int msgset, int message,
447 const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
449 /* This function will add an entry to the I/O completion port
450 * that will signal the I/O thread to exit (gracefully)
452 static void
453 signal_iocompletionport_exit(isc_socketmgr_t *manager) {
454 int i;
455 int errval;
456 char strbuf[ISC_STRERRORSIZE];
458 REQUIRE(VALID_MANAGER(manager));
459 for (i = 0; i < manager->maxIOCPThreads; i++) {
460 if (!PostQueuedCompletionStatus(manager->hIoCompletionPort,
461 0, 0, 0)) {
462 errval = GetLastError();
463 isc__strerror(errval, strbuf, sizeof(strbuf));
464 FATAL_ERROR(__FILE__, __LINE__,
465 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
466 ISC_MSG_FAILED,
467 "Can't request service thread to exit: %s"),
468 strbuf);
474 * Create the worker threads for the I/O Completion Port
476 void
477 iocompletionport_createthreads(int total_threads, isc_socketmgr_t *manager) {
478 int errval;
479 char strbuf[ISC_STRERRORSIZE];
480 int i;
482 INSIST(total_threads > 0);
483 REQUIRE(VALID_MANAGER(manager));
485 * We need at least one
487 for (i = 0; i < total_threads; i++) {
488 manager->hIOCPThreads[i] = CreateThread(NULL, 0, SocketIoThread,
489 manager, 0,
490 &manager->dwIOCPThreadIds[i]);
491 if (manager->hIOCPThreads[i] == NULL) {
492 errval = GetLastError();
493 isc__strerror(errval, strbuf, sizeof(strbuf));
494 FATAL_ERROR(__FILE__, __LINE__,
495 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
496 ISC_MSG_FAILED,
497 "Can't create IOCP thread: %s"),
498 strbuf);
499 exit(1);
505 * Create/initialise the I/O completion port
507 void
508 iocompletionport_init(isc_socketmgr_t *manager) {
509 int errval;
510 char strbuf[ISC_STRERRORSIZE];
512 REQUIRE(VALID_MANAGER(manager));
514 * Create a private heap to handle the socket overlapped structure
515 * The minimum number of structures is 10, there is no maximum
517 hHeapHandle = HeapCreate(0, 10 * sizeof(IoCompletionInfo), 0);
518 if (hHeapHandle == NULL) {
519 errval = GetLastError();
520 isc__strerror(errval, strbuf, sizeof(strbuf));
521 FATAL_ERROR(__FILE__, __LINE__,
522 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
523 ISC_MSG_FAILED,
524 "HeapCreate() failed during "
525 "initialization: %s"),
526 strbuf);
527 exit(1);
530 manager->maxIOCPThreads = min(isc_os_ncpus() + 1, MAX_IOCPTHREADS);
532 /* Now Create the Completion Port */
533 manager->hIoCompletionPort = CreateIoCompletionPort(
534 INVALID_HANDLE_VALUE, NULL,
535 0, manager->maxIOCPThreads);
536 if (manager->hIoCompletionPort == NULL) {
537 errval = GetLastError();
538 isc__strerror(errval, strbuf, sizeof(strbuf));
539 FATAL_ERROR(__FILE__, __LINE__,
540 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
541 ISC_MSG_FAILED,
542 "CreateIoCompletionPort() failed "
543 "during initialization: %s"),
544 strbuf);
545 exit(1);
549 * Worker threads for servicing the I/O
551 iocompletionport_createthreads(manager->maxIOCPThreads, manager);
555 * Associate a socket with an IO Completion Port. This allows us to queue events for it
556 * and have our worker pool of threads process them.
558 void
559 iocompletionport_update(isc_socket_t *sock) {
560 HANDLE hiocp;
561 char strbuf[ISC_STRERRORSIZE];
563 REQUIRE(VALID_SOCKET(sock));
565 hiocp = CreateIoCompletionPort((HANDLE)sock->fd,
566 sock->manager->hIoCompletionPort, (ULONG_PTR)sock, 0);
568 if (hiocp == NULL) {
569 DWORD errval = GetLastError();
570 isc__strerror(errval, strbuf, sizeof(strbuf));
571 isc_log_iwrite(isc_lctx,
572 ISC_LOGCATEGORY_GENERAL,
573 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
574 isc_msgcat, ISC_MSGSET_SOCKET,
575 ISC_MSG_TOOMANYHANDLES,
576 "iocompletionport_update: failed to open"
577 " io completion port: %s",
578 strbuf);
580 /* XXXMLG temporary hack to make failures detected.
581 * This function should return errors to the caller, not
582 * exit here.
584 FATAL_ERROR(__FILE__, __LINE__,
585 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
586 ISC_MSG_FAILED,
587 "CreateIoCompletionPort() failed "
588 "during initialization: %s"),
589 strbuf);
590 exit(1);
593 InterlockedIncrement(&sock->manager->iocp_total);
597 * Routine to cleanup and then close the socket.
598 * Only close the socket here if it is NOT associated
599 * with an event, otherwise the WSAWaitForMultipleEvents
600 * may fail due to the fact that the Wait should not
601 * be running while closing an event or a socket.
602 * The socket is locked before calling this function
604 void
605 socket_close(isc_socket_t *sock) {
607 REQUIRE(sock != NULL);
609 if (sock->fd != INVALID_SOCKET) {
610 closesocket(sock->fd);
611 sock->fd = INVALID_SOCKET;
612 _set_state(sock, SOCK_CLOSED);
613 InterlockedDecrement(&sock->manager->totalSockets);
617 static isc_once_t initialise_once = ISC_ONCE_INIT;
618 static isc_boolean_t initialised = ISC_FALSE;
620 static void
621 initialise(void) {
622 WORD wVersionRequested;
623 WSADATA wsaData;
624 int err;
625 SOCKET sock;
626 GUID GUIDConnectEx = WSAID_CONNECTEX;
627 GUID GUIDAcceptEx = WSAID_ACCEPTEX;
628 GUID GUIDGetAcceptExSockaddrs = WSAID_GETACCEPTEXSOCKADDRS;
629 DWORD dwBytes;
631 /* Need Winsock 2.2 or better */
632 wVersionRequested = MAKEWORD(2, 2);
634 err = WSAStartup(wVersionRequested, &wsaData);
635 if (err != 0) {
636 char strbuf[ISC_STRERRORSIZE];
637 isc__strerror(err, strbuf, sizeof(strbuf));
638 FATAL_ERROR(__FILE__, __LINE__, "WSAStartup() %s: %s",
639 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
640 ISC_MSG_FAILED, "failed"),
641 strbuf);
642 exit(1);
645 * The following APIs do not exist as functions in a library, but we must
646 * ask winsock for them. They are "extensions" -- but why they cannot be
647 * actual functions is beyond me. So, ask winsock for the pointers to the
648 * functions we need.
650 sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
651 INSIST(sock != INVALID_SOCKET);
652 err = WSAIoctl(sock, SIO_GET_EXTENSION_FUNCTION_POINTER,
653 &GUIDConnectEx, sizeof(GUIDConnectEx),
654 &ISCConnectEx, sizeof(ISCConnectEx),
655 &dwBytes, NULL, NULL);
656 INSIST(err == 0);
658 err = WSAIoctl(sock, SIO_GET_EXTENSION_FUNCTION_POINTER,
659 &GUIDAcceptEx, sizeof(GUIDAcceptEx),
660 &ISCAcceptEx, sizeof(ISCAcceptEx),
661 &dwBytes, NULL, NULL);
662 INSIST(err == 0);
664 err = WSAIoctl(sock, SIO_GET_EXTENSION_FUNCTION_POINTER,
665 &GUIDGetAcceptExSockaddrs, sizeof(GUIDGetAcceptExSockaddrs),
666 &ISCGetAcceptExSockaddrs, sizeof(ISCGetAcceptExSockaddrs),
667 &dwBytes, NULL, NULL);
668 INSIST(err == 0);
670 closesocket(sock);
672 initialised = ISC_TRUE;
676 * Initialize socket services
678 void
679 InitSockets(void) {
680 RUNTIME_CHECK(isc_once_do(&initialise_once,
681 initialise) == ISC_R_SUCCESS);
682 if (!initialised)
683 exit(1);
687 internal_sendmsg(isc_socket_t *sock, IoCompletionInfo *lpo,
688 struct msghdr *messagehdr, int flags, int *Error)
690 int Result;
691 DWORD BytesSent;
692 DWORD Flags = flags;
693 int total_sent;
695 *Error = 0;
696 Result = WSASendTo(sock->fd, messagehdr->msg_iov,
697 messagehdr->msg_iovlen, &BytesSent,
698 Flags, (SOCKADDR *)&messagehdr->to_addr,
699 messagehdr->to_addr_len, (LPWSAOVERLAPPED)lpo,
700 NULL);
702 total_sent = (int)BytesSent;
704 /* Check for errors.*/
705 if (Result == SOCKET_ERROR) {
706 *Error = WSAGetLastError();
708 switch (*Error) {
709 case WSA_IO_INCOMPLETE:
710 case WSA_WAIT_IO_COMPLETION:
711 case WSA_IO_PENDING:
712 case NO_ERROR: /* Strange, but okay */
713 sock->pending_iocp++;
714 sock->pending_send++;
715 break;
717 default:
718 return (-1);
719 break;
721 } else {
722 sock->pending_iocp++;
723 sock->pending_send++;
726 if (lpo != NULL)
727 return (0);
728 else
729 return (total_sent);
732 static void
733 queue_receive_request(isc_socket_t *sock) {
734 DWORD Flags = 0;
735 DWORD NumBytes = 0;
736 int Result;
737 int Error;
738 int need_retry;
739 WSABUF iov[1];
740 IoCompletionInfo *lpo = NULL;
741 isc_result_t isc_result;
743 retry:
744 need_retry = ISC_FALSE;
747 * If we already have a receive pending, do nothing.
749 if (sock->pending_recv > 0) {
750 if (lpo != NULL)
751 HeapFree(hHeapHandle, 0, lpo);
752 return;
756 * If no one is waiting, do nothing.
758 if (ISC_LIST_EMPTY(sock->recv_list)) {
759 if (lpo != NULL)
760 HeapFree(hHeapHandle, 0, lpo);
761 return;
764 INSIST(sock->recvbuf.remaining == 0);
765 INSIST(sock->fd != INVALID_SOCKET);
767 iov[0].len = sock->recvbuf.len;
768 iov[0].buf = sock->recvbuf.base;
770 if (lpo == NULL) {
771 lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
772 HEAP_ZERO_MEMORY,
773 sizeof(IoCompletionInfo));
774 RUNTIME_CHECK(lpo != NULL);
775 } else
776 ZeroMemory(lpo, sizeof(IoCompletionInfo));
777 lpo->request_type = SOCKET_RECV;
779 sock->recvbuf.from_addr_len = sizeof(sock->recvbuf.from_addr);
781 Error = 0;
782 Result = WSARecvFrom((SOCKET)sock->fd, iov, 1,
783 &NumBytes, &Flags,
784 (SOCKADDR *)&sock->recvbuf.from_addr,
785 &sock->recvbuf.from_addr_len,
786 (LPWSAOVERLAPPED)lpo, NULL);
788 /* Check for errors. */
789 if (Result == SOCKET_ERROR) {
790 Error = WSAGetLastError();
792 switch (Error) {
793 case WSA_IO_PENDING:
794 sock->pending_iocp++;
795 sock->pending_recv++;
796 break;
798 /* direct error: no completion event */
799 case ERROR_HOST_UNREACHABLE:
800 case WSAENETRESET:
801 case WSAECONNRESET:
802 if (!sock->connected) {
803 /* soft error */
804 need_retry = ISC_TRUE;
805 break;
807 /* FALLTHROUGH */
809 default:
810 isc_result = isc__errno2result(Error);
811 if (isc_result == ISC_R_UNEXPECTED)
812 UNEXPECTED_ERROR(__FILE__, __LINE__,
813 "WSARecvFrom: Windows error code: %d, isc result %d",
814 Error, isc_result);
815 send_recvdone_abort(sock, isc_result);
816 HeapFree(hHeapHandle, 0, lpo);
817 lpo = NULL;
818 break;
820 } else {
822 * The recv() finished immediately, but we will still get
823 * a completion event. Rather than duplicate code, let
824 * that thread handle sending the data along its way.
826 sock->pending_iocp++;
827 sock->pending_recv++;
830 socket_log(__LINE__, sock, NULL, IOEVENT,
831 isc_msgcat, ISC_MSGSET_SOCKET,
832 ISC_MSG_DOIORECV,
833 "queue_io_request: fd %d result %d error %d",
834 sock->fd, Result, Error);
836 CONSISTENT(sock);
838 if (need_retry)
839 goto retry;
842 static void
843 manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
844 isc_logmodule_t *module, int level, const char *fmt, ...)
846 char msgbuf[2048];
847 va_list ap;
849 if (!isc_log_wouldlog(isc_lctx, level))
850 return;
852 va_start(ap, fmt);
853 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
854 va_end(ap);
856 isc_log_write(isc_lctx, category, module, level,
857 "sockmgr %p: %s", sockmgr, msgbuf);
860 static void
861 socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
862 isc_logcategory_t *category, isc_logmodule_t *module, int level,
863 isc_msgcat_t *msgcat, int msgset, int message,
864 const char *fmt, ...)
866 char msgbuf[2048];
867 char peerbuf[256];
868 va_list ap;
871 if (!isc_log_wouldlog(isc_lctx, level))
872 return;
874 va_start(ap, fmt);
875 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
876 va_end(ap);
878 if (address == NULL) {
879 isc_log_iwrite(isc_lctx, category, module, level,
880 msgcat, msgset, message,
881 "socket %p line %d: %s", sock, lineno, msgbuf);
882 } else {
883 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
884 isc_log_iwrite(isc_lctx, category, module, level,
885 msgcat, msgset, message,
886 "socket %p line %d peer %s: %s", sock, lineno,
887 peerbuf, msgbuf);
893 * Make an fd SOCKET non-blocking.
895 static isc_result_t
896 make_nonblock(SOCKET fd) {
897 int ret;
898 unsigned long flags = 1;
899 char strbuf[ISC_STRERRORSIZE];
901 /* Set the socket to non-blocking */
902 ret = ioctlsocket(fd, FIONBIO, &flags);
904 if (ret == -1) {
905 isc__strerror(errno, strbuf, sizeof(strbuf));
906 UNEXPECTED_ERROR(__FILE__, __LINE__,
907 "ioctlsocket(%d, FIOBIO, %d): %s",
908 fd, flags, strbuf);
910 return (ISC_R_UNEXPECTED);
913 return (ISC_R_SUCCESS);
917 * Windows 2000 systems incorrectly cause UDP sockets using WSARecvFrom
918 * to not work correctly, returning a WSACONNRESET error when a WSASendTo
919 * fails with an "ICMP port unreachable" response and preventing the
920 * socket from using the WSARecvFrom in subsequent operations.
921 * The function below fixes this, but requires that Windows 2000
922 * Service Pack 2 or later be installed on the system. NT 4.0
923 * systems are not affected by this and work correctly.
924 * See Microsoft Knowledge Base Article Q263823 for details of this.
926 isc_result_t
927 connection_reset_fix(SOCKET fd) {
928 DWORD dwBytesReturned = 0;
929 BOOL bNewBehavior = FALSE;
930 DWORD status;
932 if (isc_win32os_versioncheck(5, 0, 0, 0) < 0)
933 return (ISC_R_SUCCESS); /* NT 4.0 has no problem */
935 /* disable bad behavior using IOCTL: SIO_UDP_CONNRESET */
936 status = WSAIoctl(fd, SIO_UDP_CONNRESET, &bNewBehavior,
937 sizeof(bNewBehavior), NULL, 0,
938 &dwBytesReturned, NULL, NULL);
939 if (status != SOCKET_ERROR)
940 return (ISC_R_SUCCESS);
941 else {
942 UNEXPECTED_ERROR(__FILE__, __LINE__,
943 "WSAIoctl(SIO_UDP_CONNRESET, oldBehaviour) %s",
944 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
945 ISC_MSG_FAILED, "failed"));
946 return (ISC_R_UNEXPECTED);
951 * Construct an iov array and attach it to the msghdr passed in. This is
952 * the SEND constructor, which will use the used region of the buffer
953 * (if using a buffer list) or will use the internal region (if a single
954 * buffer I/O is requested).
956 * Nothing can be NULL, and the done event must list at least one buffer
957 * on the buffer linked list for this function to be meaningful.
959 static void
960 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
961 struct msghdr *msg, char *cmsg, WSABUF *iov,
962 IoCompletionInfo *lpo)
964 unsigned int iovcount;
965 isc_buffer_t *buffer;
966 buflist_t *cpbuffer;
967 isc_region_t used;
968 size_t write_count;
969 size_t skip_count;
971 memset(msg, 0, sizeof(*msg));
973 memmove(&msg->to_addr, &dev->address.type, dev->address.length);
974 msg->to_addr_len = dev->address.length;
976 buffer = ISC_LIST_HEAD(dev->bufferlist);
977 write_count = 0;
978 iovcount = 0;
981 * Single buffer I/O? Skip what we've done so far in this region.
983 if (buffer == NULL) {
984 write_count = dev->region.length - dev->n;
985 cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
986 RUNTIME_CHECK(cpbuffer != NULL);
987 cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, write_count);
988 RUNTIME_CHECK(cpbuffer->buf != NULL);
990 socket_log(__LINE__, sock, NULL, TRACE,
991 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
992 "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
993 cpbuffer->buf, write_count);
995 memmove(cpbuffer->buf,(dev->region.base + dev->n), write_count);
996 cpbuffer->buflen = (unsigned int)write_count;
997 ISC_LIST_ENQUEUE(lpo->bufferlist, cpbuffer, link);
998 iov[0].buf = cpbuffer->buf;
999 iov[0].len = (u_long)write_count;
1000 iovcount = 1;
1002 goto config;
1006 * Multibuffer I/O.
1007 * Skip the data in the buffer list that we have already written.
1009 skip_count = dev->n;
1010 while (buffer != NULL) {
1011 REQUIRE(ISC_BUFFER_VALID(buffer));
1012 if (skip_count < isc_buffer_usedlength(buffer))
1013 break;
1014 skip_count -= isc_buffer_usedlength(buffer);
1015 buffer = ISC_LIST_NEXT(buffer, link);
1018 while (buffer != NULL) {
1019 INSIST(iovcount < MAXSCATTERGATHER_SEND);
1021 isc_buffer_usedregion(buffer, &used);
1023 if (used.length > 0) {
1024 int uselen = (int)(used.length - skip_count);
1025 cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
1026 RUNTIME_CHECK(cpbuffer != NULL);
1027 cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, uselen);
1028 RUNTIME_CHECK(cpbuffer->buf != NULL);
1030 socket_log(__LINE__, sock, NULL, TRACE,
1031 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1032 "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
1033 cpbuffer->buf, write_count);
1035 memmove(cpbuffer->buf,(used.base + skip_count), uselen);
1036 cpbuffer->buflen = uselen;
1037 iov[iovcount].buf = cpbuffer->buf;
1038 iov[iovcount].len = (u_long)(used.length - skip_count);
1039 write_count += uselen;
1040 skip_count = 0;
1041 iovcount++;
1043 buffer = ISC_LIST_NEXT(buffer, link);
1046 INSIST(skip_count == 0);
1048 config:
1049 msg->msg_iov = iov;
1050 msg->msg_iovlen = iovcount;
1051 msg->msg_totallen = (u_int)write_count;
1054 static void
1055 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1056 isc_socketevent_t *dev)
1058 if (sock->type == isc_sockettype_udp) {
1059 if (address != NULL)
1060 dev->address = *address;
1061 else
1062 dev->address = sock->address;
1063 } else if (sock->type == isc_sockettype_tcp) {
1064 INSIST(address == NULL);
1065 dev->address = sock->address;
1069 static void
1070 destroy_socketevent(isc_event_t *event) {
1071 isc_socketevent_t *ev = (isc_socketevent_t *)event;
1073 INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1075 (ev->destroy)(event);
1078 static isc_socketevent_t *
1079 allocate_socketevent(isc_mem_t *mctx, isc_socket_t *sock,
1080 isc_eventtype_t eventtype, isc_taskaction_t action,
1081 void *arg)
1083 isc_socketevent_t *ev;
1085 ev = (isc_socketevent_t *)isc_event_allocate(mctx, sock, eventtype,
1086 action, arg,
1087 sizeof(*ev));
1088 if (ev == NULL)
1089 return (NULL);
1091 ev->result = ISC_R_IOERROR; // XXXMLG temporary change to detect failure to set
1092 ISC_LINK_INIT(ev, ev_link);
1093 ISC_LIST_INIT(ev->bufferlist);
1094 ev->region.base = NULL;
1095 ev->n = 0;
1096 ev->offset = 0;
1097 ev->attributes = 0;
1098 ev->destroy = ev->ev_destroy;
1099 ev->ev_destroy = destroy_socketevent;
1100 ev->dscp = 0;
1102 return (ev);
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debug helper: print the name and scatter/gather vector of 'msg',
 * together with the descriptor of 'sock', to stdout.
 */
static void
dump_msg(struct msghdr *msg, isc_socket_t *sock) {
	unsigned int idx = 0;
	unsigned int total = (unsigned int)msg->msg_iovlen;

	printf("MSGHDR %p, Socket #: %u\n", msg, sock->fd);
	printf("\tname %p, namelen %d\n", msg->msg_name, msg->msg_namelen);
	printf("\tiov %p, iovlen %d\n", msg->msg_iov, msg->msg_iovlen);

	while (idx < total) {
		printf("\t\t%u\tbase %p, len %u\n", idx,
		       msg->msg_iov[idx].buf, msg->msg_iov[idx].len);
		idx++;
	}
}
#endif
1120 * map the error code
1123 map_socket_error(isc_socket_t *sock, int windows_errno, int *isc_errno,
1124 char *errorstring, size_t bufsize) {
1126 int doreturn;
1127 switch (windows_errno) {
1128 case WSAECONNREFUSED:
1129 *isc_errno = ISC_R_CONNREFUSED;
1130 if (sock->connected)
1131 doreturn = DOIO_HARD;
1132 else
1133 doreturn = DOIO_SOFT;
1134 break;
1135 case WSAENETUNREACH:
1136 case ERROR_NETWORK_UNREACHABLE:
1137 *isc_errno = ISC_R_NETUNREACH;
1138 if (sock->connected)
1139 doreturn = DOIO_HARD;
1140 else
1141 doreturn = DOIO_SOFT;
1142 break;
1143 case ERROR_PORT_UNREACHABLE:
1144 case ERROR_HOST_UNREACHABLE:
1145 case WSAEHOSTUNREACH:
1146 *isc_errno = ISC_R_HOSTUNREACH;
1147 if (sock->connected)
1148 doreturn = DOIO_HARD;
1149 else
1150 doreturn = DOIO_SOFT;
1151 break;
1152 case WSAENETDOWN:
1153 *isc_errno = ISC_R_NETDOWN;
1154 if (sock->connected)
1155 doreturn = DOIO_HARD;
1156 else
1157 doreturn = DOIO_SOFT;
1158 break;
1159 case WSAEHOSTDOWN:
1160 *isc_errno = ISC_R_HOSTDOWN;
1161 if (sock->connected)
1162 doreturn = DOIO_HARD;
1163 else
1164 doreturn = DOIO_SOFT;
1165 break;
1166 case WSAEACCES:
1167 *isc_errno = ISC_R_NOPERM;
1168 if (sock->connected)
1169 doreturn = DOIO_HARD;
1170 else
1171 doreturn = DOIO_SOFT;
1172 break;
1173 case WSAECONNRESET:
1174 case WSAENETRESET:
1175 case WSAECONNABORTED:
1176 case WSAEDISCON:
1177 *isc_errno = ISC_R_CONNECTIONRESET;
1178 if (sock->connected)
1179 doreturn = DOIO_HARD;
1180 else
1181 doreturn = DOIO_SOFT;
1182 break;
1183 case WSAENOTCONN:
1184 *isc_errno = ISC_R_NOTCONNECTED;
1185 if (sock->connected)
1186 doreturn = DOIO_HARD;
1187 else
1188 doreturn = DOIO_SOFT;
1189 break;
1190 case ERROR_OPERATION_ABORTED:
1191 case ERROR_CONNECTION_ABORTED:
1192 case ERROR_REQUEST_ABORTED:
1193 *isc_errno = ISC_R_CONNECTIONRESET;
1194 doreturn = DOIO_HARD;
1195 break;
1196 case WSAENOBUFS:
1197 *isc_errno = ISC_R_NORESOURCES;
1198 doreturn = DOIO_HARD;
1199 break;
1200 case WSAEAFNOSUPPORT:
1201 *isc_errno = ISC_R_FAMILYNOSUPPORT;
1202 doreturn = DOIO_HARD;
1203 break;
1204 case WSAEADDRNOTAVAIL:
1205 *isc_errno = ISC_R_ADDRNOTAVAIL;
1206 doreturn = DOIO_HARD;
1207 break;
1208 case WSAEDESTADDRREQ:
1209 *isc_errno = ISC_R_BADADDRESSFORM;
1210 doreturn = DOIO_HARD;
1211 break;
1212 case ERROR_NETNAME_DELETED:
1213 *isc_errno = ISC_R_NETDOWN;
1214 doreturn = DOIO_HARD;
1215 break;
1216 default:
1217 *isc_errno = ISC_R_IOERROR;
1218 doreturn = DOIO_HARD;
1219 break;
1221 if (doreturn == DOIO_HARD) {
1222 isc__strerror(windows_errno, errorstring, bufsize);
1224 return (doreturn);
1227 static void
1228 fill_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1229 isc_region_t r;
1230 int copylen;
1231 isc_buffer_t *buffer;
1233 INSIST(dev->n < dev->minimum);
1234 INSIST(sock->recvbuf.remaining > 0);
1235 INSIST(sock->pending_recv == 0);
1237 if (sock->type == isc_sockettype_udp) {
1238 dev->address.length = sock->recvbuf.from_addr_len;
1239 memmove(&dev->address.type, &sock->recvbuf.from_addr,
1240 sock->recvbuf.from_addr_len);
1241 if (isc_sockaddr_getport(&dev->address) == 0) {
1242 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1243 socket_log(__LINE__, sock, &dev->address, IOEVENT,
1244 isc_msgcat, ISC_MSGSET_SOCKET,
1245 ISC_MSG_ZEROPORT,
1246 "dropping source port zero packet");
1248 sock->recvbuf.remaining = 0;
1249 return;
1251 } else if (sock->type == isc_sockettype_tcp) {
1252 dev->address = sock->address;
1256 * Run through the list of buffers we were given, and find the
1257 * first one with space. Once it is found, loop through, filling
1258 * the buffers as much as possible.
1260 buffer = ISC_LIST_HEAD(dev->bufferlist);
1261 if (buffer != NULL) { // Multi-buffer receive
1262 while (buffer != NULL && sock->recvbuf.remaining > 0) {
1263 REQUIRE(ISC_BUFFER_VALID(buffer));
1264 if (isc_buffer_availablelength(buffer) > 0) {
1265 isc_buffer_availableregion(buffer, &r);
1266 copylen = min(r.length,
1267 sock->recvbuf.remaining);
1268 memmove(r.base, sock->recvbuf.consume_position,
1269 copylen);
1270 sock->recvbuf.consume_position += copylen;
1271 sock->recvbuf.remaining -= copylen;
1272 isc_buffer_add(buffer, copylen);
1273 dev->n += copylen;
1275 buffer = ISC_LIST_NEXT(buffer, link);
1277 } else { // Single-buffer receive
1278 copylen = min(dev->region.length - dev->n, sock->recvbuf.remaining);
1279 memmove(dev->region.base + dev->n,
1280 sock->recvbuf.consume_position, copylen);
1281 sock->recvbuf.consume_position += copylen;
1282 sock->recvbuf.remaining -= copylen;
1283 dev->n += copylen;
1287 * UDP receives are all-consuming. That is, if we have 4k worth of
1288 * data in our receive buffer, and the caller only gave us
1289 * 1k of space, we will toss the remaining 3k of data. TCP
1290 * will keep the extra data around and use it for later requests.
1292 if (sock->type == isc_sockettype_udp)
1293 sock->recvbuf.remaining = 0;
1297 * Copy out as much data from the internal buffer to done events.
1298 * As each done event is filled, send it along its way.
1300 static void
1301 completeio_recv(isc_socket_t *sock)
1303 isc_socketevent_t *dev;
1306 * If we are in the process of filling our buffer, we cannot
1307 * touch it yet, so don't.
1309 if (sock->pending_recv > 0)
1310 return;
1312 while (sock->recvbuf.remaining > 0 && !ISC_LIST_EMPTY(sock->recv_list)) {
1313 dev = ISC_LIST_HEAD(sock->recv_list);
1316 * See if we have sufficient data in our receive buffer
1317 * to handle this. If we do, copy out the data.
1319 fill_recv(sock, dev);
1322 * Did we satisfy it?
1324 if (dev->n >= dev->minimum) {
1325 dev->result = ISC_R_SUCCESS;
1326 send_recvdone_event(sock, &dev);
1332 * Returns:
1333 * DOIO_SUCCESS The operation succeeded. dev->result contains
1334 * ISC_R_SUCCESS.
1336 * DOIO_HARD A hard or unexpected I/O error was encountered.
1337 * dev->result contains the appropriate error.
1339 * DOIO_SOFT A soft I/O error was encountered. No senddone
1340 * event was sent. The operation should be retried.
1342 * No other return values are possible.
1344 static int
1345 completeio_send(isc_socket_t *sock, isc_socketevent_t *dev,
1346 struct msghdr *messagehdr, int cc, int send_errno)
1348 char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1349 char strbuf[ISC_STRERRORSIZE];
1351 if (send_errno != 0) {
1352 if (SOFT_ERROR(send_errno))
1353 return (DOIO_SOFT);
1355 return (map_socket_error(sock, send_errno, &dev->result,
1356 strbuf, sizeof(strbuf)));
1359 * The other error types depend on whether or not the
1360 * socket is UDP or TCP. If it is UDP, some errors
1361 * that we expect to be fatal under TCP are merely
1362 * annoying, and are really soft errors.
1364 * However, these soft errors are still returned as
1365 * a status.
1367 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1368 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1369 UNEXPECTED_ERROR(__FILE__, __LINE__, "completeio_send: %s: %s",
1370 addrbuf, strbuf);
1371 dev->result = isc__errno2result(send_errno);
1372 return (DOIO_HARD);
1376 * If we write less than we expected, update counters, poke.
1378 dev->n += cc;
1379 if (cc != messagehdr->msg_totallen)
1380 return (DOIO_SOFT);
1383 * Exactly what we wanted to write. We're done with this
1384 * entry. Post its completion event.
1386 dev->result = ISC_R_SUCCESS;
1387 return (DOIO_SUCCESS);
1390 static int
1391 startio_send(isc_socket_t *sock, isc_socketevent_t *dev, int *nbytes,
1392 int *send_errno)
1394 char *cmsg = NULL;
1395 char strbuf[ISC_STRERRORSIZE];
1396 IoCompletionInfo *lpo;
1397 int status;
1398 struct msghdr *msghdr;
1400 lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
1401 HEAP_ZERO_MEMORY,
1402 sizeof(IoCompletionInfo));
1403 RUNTIME_CHECK(lpo != NULL);
1404 lpo->request_type = SOCKET_SEND;
1405 lpo->dev = dev;
1406 msghdr = &lpo->messagehdr;
1407 memset(msghdr, 0, sizeof(struct msghdr));
1408 ISC_LIST_INIT(lpo->bufferlist);
1410 build_msghdr_send(sock, dev, msghdr, cmsg, sock->iov, lpo);
1412 *nbytes = internal_sendmsg(sock, lpo, msghdr, 0, send_errno);
1414 if (*nbytes <= 0) {
1416 * I/O has been initiated
1417 * completion will be through the completion port
1419 if (PENDING_ERROR(*send_errno)) {
1420 status = DOIO_PENDING;
1421 goto done;
1424 if (SOFT_ERROR(*send_errno)) {
1425 status = DOIO_SOFT;
1426 goto done;
1430 * If we got this far then something is wrong
1432 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1433 isc__strerror(*send_errno, strbuf, sizeof(strbuf));
1434 socket_log(__LINE__, sock, NULL, IOEVENT,
1435 isc_msgcat, ISC_MSGSET_SOCKET,
1436 ISC_MSG_INTERNALSEND,
1437 "startio_send: internal_sendmsg(%d) %d "
1438 "bytes, err %d/%s",
1439 sock->fd, *nbytes, *send_errno, strbuf);
1441 status = DOIO_HARD;
1442 goto done;
1444 dev->result = ISC_R_SUCCESS;
1445 status = DOIO_SOFT;
1446 done:
1447 _set_state(sock, SOCK_DATA);
1448 return (status);
1451 static void
1452 use_min_mtu(isc_socket_t *sock) {
1453 #ifdef IPV6_USE_MIN_MTU
1454 /* use minimum MTU */
1455 if (sock->pf == AF_INET6) {
1456 int on = 1;
1457 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
1458 (void *)&on, sizeof(on));
1460 #else
1461 UNUSED(sock);
1462 #endif
1465 static isc_result_t
1466 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1467 isc_socket_t **socketp) {
1468 isc_socket_t *sock;
1469 isc_result_t result;
1471 sock = isc_mem_get(manager->mctx, sizeof(*sock));
1473 if (sock == NULL)
1474 return (ISC_R_NOMEMORY);
1476 sock->magic = 0;
1477 sock->references = 0;
1479 sock->manager = manager;
1480 sock->type = type;
1481 sock->fd = INVALID_SOCKET;
1483 ISC_LINK_INIT(sock, link);
1486 * Set up list of readers and writers to be initially empty.
1488 ISC_LIST_INIT(sock->recv_list);
1489 ISC_LIST_INIT(sock->send_list);
1490 ISC_LIST_INIT(sock->accept_list);
1491 sock->connect_ev = NULL;
1492 sock->pending_accept = 0;
1493 sock->pending_recv = 0;
1494 sock->pending_send = 0;
1495 sock->pending_iocp = 0;
1496 sock->listener = 0;
1497 sock->connected = 0;
1498 sock->pending_connect = 0;
1499 sock->bound = 0;
1500 sock->dupped = 0;
1501 memset(sock->name, 0, sizeof(sock->name)); // zero the name field
1502 _set_state(sock, SOCK_INITIALIZED);
1504 sock->recvbuf.len = 65536;
1505 sock->recvbuf.consume_position = sock->recvbuf.base;
1506 sock->recvbuf.remaining = 0;
1507 sock->recvbuf.base = isc_mem_get(manager->mctx, sock->recvbuf.len); // max buffer size
1508 if (sock->recvbuf.base == NULL) {
1509 result = ISC_R_NOMEMORY;
1510 goto error;
1514 * Initialize the lock.
1516 result = isc_mutex_init(&sock->lock);
1517 if (result != ISC_R_SUCCESS)
1518 goto error;
1520 socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1521 "allocated");
1523 sock->magic = SOCKET_MAGIC;
1524 *socketp = sock;
1526 return (ISC_R_SUCCESS);
1528 error:
1529 if (sock->recvbuf.base != NULL)
1530 isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1531 isc_mem_put(manager->mctx, sock, sizeof(*sock));
1533 return (result);
1537 * Verify that the socket state is consistent.
1539 static void
1540 consistent(isc_socket_t *sock) {
1542 isc_socketevent_t *dev;
1543 isc_socket_newconnev_t *nev;
1544 unsigned int count;
1545 char *crash_reason;
1546 isc_boolean_t crash = ISC_FALSE;
1548 REQUIRE(sock->pending_iocp == sock->pending_recv + sock->pending_send
1549 + sock->pending_accept + sock->pending_connect);
1551 dev = ISC_LIST_HEAD(sock->send_list);
1552 count = 0;
1553 while (dev != NULL) {
1554 count++;
1555 dev = ISC_LIST_NEXT(dev, ev_link);
1557 if (count > sock->pending_send) {
1558 crash = ISC_TRUE;
1559 crash_reason = "send_list > sock->pending_send";
1562 nev = ISC_LIST_HEAD(sock->accept_list);
1563 count = 0;
1564 while (nev != NULL) {
1565 count++;
1566 nev = ISC_LIST_NEXT(nev, ev_link);
1568 if (count > sock->pending_accept) {
1569 crash = ISC_TRUE;
1570 crash_reason = "send_list > sock->pending_send";
1573 if (crash) {
1574 socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1575 ISC_MSG_DESTROYING, "SOCKET INCONSISTENT: %s",
1576 crash_reason);
1577 sock_dump(sock);
1578 INSIST(crash == ISC_FALSE);
1583 * Maybe free the socket.
1585 * This function will verify tht the socket is no longer in use in any way,
1586 * either internally or externally. This is the only place where this
1587 * check is to be made; if some bit of code believes that IT is done with
1588 * the socket (e.g., some reference counter reaches zero), it should call
1589 * this function.
1591 * When calling this function, the socket must be locked, and the manager
1592 * must be unlocked.
1594 * When this function returns, *socketp will be NULL. No tricks to try
1595 * to hold on to this pointer are allowed.
1597 static void
1598 maybe_free_socket(isc_socket_t **socketp, int lineno) {
1599 isc_socket_t *sock = *socketp;
1600 *socketp = NULL;
1602 INSIST(VALID_SOCKET(sock));
1603 CONSISTENT(sock);
1605 if (sock->pending_iocp > 0
1606 || sock->pending_recv > 0
1607 || sock->pending_send > 0
1608 || sock->pending_accept > 0
1609 || sock->references > 0
1610 || sock->pending_connect == 1
1611 || !ISC_LIST_EMPTY(sock->recv_list)
1612 || !ISC_LIST_EMPTY(sock->send_list)
1613 || !ISC_LIST_EMPTY(sock->accept_list)
1614 || sock->fd != INVALID_SOCKET) {
1615 UNLOCK(&sock->lock);
1616 return;
1618 UNLOCK(&sock->lock);
1620 free_socket(&sock, lineno);
1623 void
1624 free_socket(isc_socket_t **sockp, int lineno) {
1625 isc_socketmgr_t *manager;
1626 isc_socket_t *sock = *sockp;
1627 *sockp = NULL;
1630 * Seems we can free the socket after all.
1632 manager = sock->manager;
1633 socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat,
1634 ISC_MSGSET_SOCKET, ISC_MSG_DESTROYING,
1635 "freeing socket line %d fd %d lock %p semaphore %p",
1636 lineno, sock->fd, &sock->lock, sock->lock.LockSemaphore);
1638 sock->magic = 0;
1639 DESTROYLOCK(&sock->lock);
1641 if (sock->recvbuf.base != NULL)
1642 isc_mem_put(manager->mctx, sock->recvbuf.base,
1643 sock->recvbuf.len);
1645 LOCK(&manager->lock);
1646 if (ISC_LINK_LINKED(sock, link))
1647 ISC_LIST_UNLINK(manager->socklist, sock, link);
1648 isc_mem_put(manager->mctx, sock, sizeof(*sock));
1650 if (ISC_LIST_EMPTY(manager->socklist))
1651 SIGNAL(&manager->shutdown_ok);
1652 UNLOCK(&manager->lock);
1656 * Create a new 'type' socket managed by 'manager'. Events
1657 * will be posted to 'task' and when dispatched 'action' will be
1658 * called with 'arg' as the arg value. The new socket is returned
1659 * in 'socketp'.
1661 static isc_result_t
1662 socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1663 isc_socket_t **socketp, isc_socket_t *dup_socket)
1665 isc_socket_t *sock = NULL;
1666 isc_result_t result;
1667 #if defined(USE_CMSG)
1668 int on = 1;
1669 #endif
1670 #if defined(SO_RCVBUF)
1671 ISC_SOCKADDR_LEN_T optlen;
1672 int size;
1673 #endif
1674 int socket_errno;
1675 char strbuf[ISC_STRERRORSIZE];
1677 REQUIRE(VALID_MANAGER(manager));
1678 REQUIRE(socketp != NULL && *socketp == NULL);
1679 REQUIRE(type != isc_sockettype_fdwatch);
1681 #ifndef SOCK_RAW
1682 if (type == isc_sockettype_raw)
1683 return (ISC_R_NOTIMPLEMENTED);
1684 #endif
1686 result = allocate_socket(manager, type, &sock);
1687 if (result != ISC_R_SUCCESS)
1688 return (result);
1690 sock->pf = pf;
1691 switch (type) {
1692 case isc_sockettype_udp:
1693 sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
1694 if (sock->fd != INVALID_SOCKET) {
1695 result = connection_reset_fix(sock->fd);
1696 if (result != ISC_R_SUCCESS) {
1697 socket_log(__LINE__, sock,
1698 NULL, EVENT, NULL, 0, 0,
1699 "closed %d %d %d "
1700 "con_reset_fix_failed",
1701 sock->pending_recv,
1702 sock->pending_send,
1703 sock->references);
1704 closesocket(sock->fd);
1705 _set_state(sock, SOCK_CLOSED);
1706 sock->fd = INVALID_SOCKET;
1707 free_socket(&sock, __LINE__);
1708 return (result);
1711 break;
1712 case isc_sockettype_tcp:
1713 sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
1714 break;
1715 #ifdef SOCK_RAW
1716 case isc_sockettype_raw:
1717 sock->fd = socket(pf, SOCK_RAW, 0);
1718 #ifdef PF_ROUTE
1719 if (pf == PF_ROUTE)
1720 sock->bound = 1;
1721 #endif
1722 break;
1723 #endif
1726 if (sock->fd == INVALID_SOCKET) {
1727 socket_errno = WSAGetLastError();
1728 free_socket(&sock, __LINE__);
1730 switch (socket_errno) {
1731 case WSAEMFILE:
1732 case WSAENOBUFS:
1733 return (ISC_R_NORESOURCES);
1735 case WSAEPROTONOSUPPORT:
1736 case WSAEPFNOSUPPORT:
1737 case WSAEAFNOSUPPORT:
1738 return (ISC_R_FAMILYNOSUPPORT);
1740 default:
1741 isc__strerror(socket_errno, strbuf, sizeof(strbuf));
1742 UNEXPECTED_ERROR(__FILE__, __LINE__,
1743 "socket() %s: %s",
1744 isc_msgcat_get(isc_msgcat,
1745 ISC_MSGSET_GENERAL,
1746 ISC_MSG_FAILED,
1747 "failed"),
1748 strbuf);
1749 return (ISC_R_UNEXPECTED);
1753 result = make_nonblock(sock->fd);
1754 if (result != ISC_R_SUCCESS) {
1755 socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1756 "closed %d %d %d make_nonblock_failed",
1757 sock->pending_recv, sock->pending_send,
1758 sock->references);
1759 closesocket(sock->fd);
1760 sock->fd = INVALID_SOCKET;
1761 free_socket(&sock, __LINE__);
1762 return (result);
1766 * Use minimum mtu if possible.
1768 use_min_mtu(sock);
1770 #if defined(USE_CMSG) || defined(SO_RCVBUF)
1771 if (type == isc_sockettype_udp) {
1773 #if defined(USE_CMSG)
1774 #if defined(ISC_PLATFORM_HAVEIPV6)
1775 #ifdef IPV6_RECVPKTINFO
1776 /* 2292bis */
1777 if ((pf == AF_INET6)
1778 && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1779 (char *)&on, sizeof(on)) < 0)) {
1780 isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
1781 UNEXPECTED_ERROR(__FILE__, __LINE__,
1782 "setsockopt(%d, IPV6_RECVPKTINFO) "
1783 "%s: %s", sock->fd,
1784 isc_msgcat_get(isc_msgcat,
1785 ISC_MSGSET_GENERAL,
1786 ISC_MSG_FAILED,
1787 "failed"),
1788 strbuf);
1790 #else
1791 /* 2292 */
1792 if ((pf == AF_INET6)
1793 && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
1794 (char *)&on, sizeof(on)) < 0)) {
1795 isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
1796 UNEXPECTED_ERROR(__FILE__, __LINE__,
1797 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
1798 sock->fd,
1799 isc_msgcat_get(isc_msgcat,
1800 ISC_MSGSET_GENERAL,
1801 ISC_MSG_FAILED,
1802 "failed"),
1803 strbuf);
1805 #endif /* IPV6_RECVPKTINFO */
1806 #endif /* ISC_PLATFORM_HAVEIPV6 */
1807 #endif /* defined(USE_CMSG) */
1809 #if defined(SO_RCVBUF)
1810 optlen = sizeof(size);
1811 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1812 (char *)&size, &optlen) >= 0 &&
1813 size < RCVBUFSIZE) {
1814 size = RCVBUFSIZE;
1815 (void)setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1816 (char *)&size, sizeof(size));
1818 #endif
1821 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
1823 _set_state(sock, SOCK_OPEN);
1824 sock->references = 1;
1825 *socketp = sock;
1827 iocompletionport_update(sock);
1829 if (dup_socket) {
1830 #ifndef ISC_ALLOW_MAPPED
1831 isc__socket_ipv6only(sock, ISC_TRUE);
1832 #endif
1834 if (dup_socket->bound) {
1835 isc_sockaddr_t local;
1837 result = isc__socket_getsockname(dup_socket, &local);
1838 if (result != ISC_R_SUCCESS) {
1839 isc_socket_close(sock);
1840 return (result);
1842 result = isc__socket_bind(sock, &local,
1843 ISC_SOCKET_REUSEADDRESS);
1844 if (result != ISC_R_SUCCESS) {
1845 isc_socket_close(sock);
1846 return (result);
1849 sock->dupped = 1;
1853 * Note we don't have to lock the socket like we normally would because
1854 * there are no external references to it yet.
1856 LOCK(&manager->lock);
1857 ISC_LIST_APPEND(manager->socklist, sock, link);
1858 InterlockedIncrement(&manager->totalSockets);
1859 UNLOCK(&manager->lock);
1861 socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat,
1862 ISC_MSGSET_SOCKET, ISC_MSG_CREATED,
1863 "created %u type %u", sock->fd, type);
1865 return (ISC_R_SUCCESS);
1868 isc_result_t
1869 isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1870 isc_socket_t **socketp)
1872 return (socket_create(manager, pf, type, socketp, NULL));
1875 isc_result_t
1876 isc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp) {
1877 REQUIRE(VALID_SOCKET(sock));
1878 REQUIRE(socketp != NULL && *socketp == NULL);
1880 return (socket_create(sock->manager, sock->pf, sock->type,
1881 socketp, sock));
1884 isc_result_t
1885 isc_socket_open(isc_socket_t *sock) {
1886 REQUIRE(VALID_SOCKET(sock));
1887 REQUIRE(sock->type != isc_sockettype_fdwatch);
1889 return (ISC_R_NOTIMPLEMENTED);
1893 * Attach to a socket. Caller must explicitly detach when it is done.
1895 void
1896 isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1897 REQUIRE(VALID_SOCKET(sock));
1898 REQUIRE(socketp != NULL && *socketp == NULL);
1900 LOCK(&sock->lock);
1901 CONSISTENT(sock);
1902 sock->references++;
1903 UNLOCK(&sock->lock);
1905 *socketp = sock;
1909 * Dereference a socket. If this is the last reference to it, clean things
1910 * up by destroying the socket.
1912 void
1913 isc__socket_detach(isc_socket_t **socketp) {
1914 isc_socket_t *sock;
1916 REQUIRE(socketp != NULL);
1917 sock = *socketp;
1918 REQUIRE(VALID_SOCKET(sock));
1919 REQUIRE(sock->type != isc_sockettype_fdwatch);
1921 LOCK(&sock->lock);
1922 CONSISTENT(sock);
1923 REQUIRE(sock->references > 0);
1924 sock->references--;
1926 socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1927 "detach_socket %d %d %d",
1928 sock->pending_recv, sock->pending_send,
1929 sock->references);
1931 if (sock->references == 0 && sock->fd != INVALID_SOCKET) {
1932 closesocket(sock->fd);
1933 sock->fd = INVALID_SOCKET;
1934 _set_state(sock, SOCK_CLOSED);
1937 maybe_free_socket(&sock, __LINE__);
1939 *socketp = NULL;
1942 isc_result_t
1943 isc_socket_close(isc_socket_t *sock) {
1944 REQUIRE(VALID_SOCKET(sock));
1945 REQUIRE(sock->type != isc_sockettype_fdwatch);
1947 return (ISC_R_NOTIMPLEMENTED);
1951 * Dequeue an item off the given socket's read queue, set the result code
1952 * in the done event to the one provided, and send it to the task it was
1953 * destined for.
1955 * If the event to be sent is on a list, remove it before sending. If
1956 * asked to, send and detach from the task as well.
1958 * Caller must have the socket locked if the event is attached to the socket.
1960 static void
1961 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1962 isc_task_t *task;
1964 task = (*dev)->ev_sender;
1965 (*dev)->ev_sender = sock;
1967 if (ISC_LINK_LINKED(*dev, ev_link))
1968 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1970 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1971 == ISC_SOCKEVENTATTR_ATTACHED)
1972 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1973 else
1974 isc_task_send(task, (isc_event_t **)dev);
1976 CONSISTENT(sock);
1980 * See comments for send_recvdone_event() above.
1982 static void
1983 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1984 isc_task_t *task;
1986 INSIST(dev != NULL && *dev != NULL);
1988 task = (*dev)->ev_sender;
1989 (*dev)->ev_sender = sock;
1991 if (ISC_LINK_LINKED(*dev, ev_link))
1992 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1994 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1995 == ISC_SOCKEVENTATTR_ATTACHED)
1996 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1997 else
1998 isc_task_send(task, (isc_event_t **)dev);
2000 CONSISTENT(sock);
2004 * See comments for send_recvdone_event() above.
2006 static void
2007 send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev) {
2008 isc_task_t *task;
2010 INSIST(adev != NULL && *adev != NULL);
2012 task = (*adev)->ev_sender;
2013 (*adev)->ev_sender = sock;
2015 if (ISC_LINK_LINKED(*adev, ev_link))
2016 ISC_LIST_DEQUEUE(sock->accept_list, *adev, ev_link);
2018 isc_task_sendanddetach(&task, (isc_event_t **)adev);
2020 CONSISTENT(sock);
2024 * See comments for send_recvdone_event() above.
2026 static void
2027 send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev) {
2028 isc_task_t *task;
2030 INSIST(cdev != NULL && *cdev != NULL);
2032 task = (*cdev)->ev_sender;
2033 (*cdev)->ev_sender = sock;
2035 sock->connect_ev = NULL;
2037 isc_task_sendanddetach(&task, (isc_event_t **)cdev);
2039 CONSISTENT(sock);
2043 * On entry to this function, the event delivered is the internal
2044 * readable event, and the first item on the accept_list should be
2045 * the done event we want to send. If the list is empty, this is a no-op,
2046 * so just close the new connection, unlock, and return.
2048 * Note the socket is locked before entering here
2050 static void
2051 internal_accept(isc_socket_t *sock, IoCompletionInfo *lpo, int accept_errno) {
2052 isc_socket_newconnev_t *adev;
2053 isc_result_t result = ISC_R_SUCCESS;
2054 isc_socket_t *nsock;
2055 struct sockaddr *localaddr;
2056 int localaddr_len = sizeof(*localaddr);
2057 struct sockaddr *remoteaddr;
2058 int remoteaddr_len = sizeof(*remoteaddr);
2060 INSIST(VALID_SOCKET(sock));
2061 LOCK(&sock->lock);
2062 CONSISTENT(sock);
2064 socket_log(__LINE__, sock, NULL, TRACE,
2065 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2066 "internal_accept called");
2068 INSIST(sock->listener);
2070 INSIST(sock->pending_iocp > 0);
2071 sock->pending_iocp--;
2072 INSIST(sock->pending_accept > 0);
2073 sock->pending_accept--;
2075 adev = lpo->adev;
2078 * If the event is no longer in the list we can just return.
2080 if (!acceptdone_is_active(sock, adev))
2081 goto done;
2083 nsock = adev->newsocket;
2086 * Pull off the done event.
2088 ISC_LIST_UNLINK(sock->accept_list, adev, ev_link);
2091 * Extract the addresses from the socket, copy them into the structure,
2092 * and return the new socket.
2094 ISCGetAcceptExSockaddrs(lpo->acceptbuffer, 0,
2095 sizeof(SOCKADDR_STORAGE) + 16, sizeof(SOCKADDR_STORAGE) + 16,
2096 (LPSOCKADDR *)&localaddr, &localaddr_len,
2097 (LPSOCKADDR *)&remoteaddr, &remoteaddr_len);
2098 memmove(&adev->address.type, remoteaddr, remoteaddr_len);
2099 adev->address.length = remoteaddr_len;
2100 nsock->address = adev->address;
2101 nsock->pf = adev->address.type.sa.sa_family;
2103 socket_log(__LINE__, nsock, &nsock->address, TRACE,
2104 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2105 "internal_accept parent %p", sock);
2107 result = make_nonblock(adev->newsocket->fd);
2108 INSIST(result == ISC_R_SUCCESS);
2111 * Use minimum mtu if possible.
2113 use_min_mtu(adev->newsocket);
2115 INSIST(setsockopt(nsock->fd, SOL_SOCKET, SO_UPDATE_ACCEPT_CONTEXT,
2116 (char *)&sock->fd, sizeof(sock->fd)) == 0);
2119 * Hook it up into the manager.
2121 nsock->bound = 1;
2122 nsock->connected = 1;
2123 _set_state(nsock, SOCK_OPEN);
2125 LOCK(&nsock->manager->lock);
2126 ISC_LIST_APPEND(nsock->manager->socklist, nsock, link);
2127 InterlockedIncrement(&nsock->manager->totalSockets);
2128 UNLOCK(&nsock->manager->lock);
2130 socket_log(__LINE__, sock, &nsock->address, CREATION,
2131 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2132 "accepted_connection new_socket %p fd %d",
2133 nsock, nsock->fd);
2135 adev->result = result;
2136 send_acceptdone_event(sock, &adev);
2138 done:
2139 CONSISTENT(sock);
2140 UNLOCK(&sock->lock);
2142 HeapFree(hHeapHandle, 0, lpo->acceptbuffer);
2143 lpo->acceptbuffer = NULL;
2147 * Called when a socket with a pending connect() finishes.
2148 * Note that the socket is locked before entering.
2150 static void
2151 internal_connect(isc_socket_t *sock, IoCompletionInfo *lpo, int connect_errno) {
2152 isc_socket_connev_t *cdev;
2153 char strbuf[ISC_STRERRORSIZE];
2155 INSIST(VALID_SOCKET(sock));
2157 LOCK(&sock->lock);
2159 INSIST(sock->pending_iocp > 0);
2160 sock->pending_iocp--;
2161 INSIST(sock->pending_connect == 1);
2162 sock->pending_connect = 0;
2165 * Has this event been canceled?
2167 cdev = lpo->cdev;
2168 if (!connectdone_is_active(sock, cdev)) {
2169 sock->pending_connect = 0;
2170 if (sock->fd != INVALID_SOCKET) {
2171 closesocket(sock->fd);
2172 sock->fd = INVALID_SOCKET;
2173 _set_state(sock, SOCK_CLOSED);
2175 CONSISTENT(sock);
2176 UNLOCK(&sock->lock);
2177 return;
2181 * Check possible Windows network event error status here.
2183 if (connect_errno != 0) {
2185 * If the error is SOFT, just try again on this
2186 * fd and pretend nothing strange happened.
2188 if (SOFT_ERROR(connect_errno) ||
2189 connect_errno == WSAEINPROGRESS) {
2190 sock->pending_connect = 1;
2191 CONSISTENT(sock);
2192 UNLOCK(&sock->lock);
2193 return;
2197 * Translate other errors into ISC_R_* flavors.
2199 switch (connect_errno) {
2200 #define ERROR_MATCH(a, b) case a: cdev->result = b; break;
2201 ERROR_MATCH(WSAEACCES, ISC_R_NOPERM);
2202 ERROR_MATCH(WSAEADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2203 ERROR_MATCH(WSAEAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2204 ERROR_MATCH(WSAECONNREFUSED, ISC_R_CONNREFUSED);
2205 ERROR_MATCH(WSAEHOSTUNREACH, ISC_R_HOSTUNREACH);
2206 ERROR_MATCH(WSAEHOSTDOWN, ISC_R_HOSTDOWN);
2207 ERROR_MATCH(WSAENETUNREACH, ISC_R_NETUNREACH);
2208 ERROR_MATCH(WSAENETDOWN, ISC_R_NETDOWN);
2209 ERROR_MATCH(WSAENOBUFS, ISC_R_NORESOURCES);
2210 ERROR_MATCH(WSAECONNRESET, ISC_R_CONNECTIONRESET);
2211 ERROR_MATCH(WSAECONNABORTED, ISC_R_CONNECTIONRESET);
2212 ERROR_MATCH(WSAETIMEDOUT, ISC_R_TIMEDOUT);
2213 #undef ERROR_MATCH
2214 default:
2215 cdev->result = ISC_R_UNEXPECTED;
2216 isc__strerror(connect_errno, strbuf, sizeof(strbuf));
2217 UNEXPECTED_ERROR(__FILE__, __LINE__,
2218 "internal_connect: connect() %s",
2219 strbuf);
2221 } else {
2222 INSIST(setsockopt(sock->fd, SOL_SOCKET,
2223 SO_UPDATE_CONNECT_CONTEXT, NULL, 0) == 0);
2224 cdev->result = ISC_R_SUCCESS;
2225 sock->connected = 1;
2226 socket_log(__LINE__, sock, &sock->address, IOEVENT,
2227 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2228 "internal_connect: success");
2231 send_connectdone_event(sock, &cdev);
2233 UNLOCK(&sock->lock);
2237 * Loop through the socket, returning ISC_R_EOF for each done event pending.
2239 static void
2240 send_recvdone_abort(isc_socket_t *sock, isc_result_t result) {
2241 isc_socketevent_t *dev;
2243 while (!ISC_LIST_EMPTY(sock->recv_list)) {
2244 dev = ISC_LIST_HEAD(sock->recv_list);
2245 dev->result = result;
2246 send_recvdone_event(sock, &dev);
2251 * Take the data we received in our private buffer, and if any recv() calls on
2252 * our list are satisfied, send the corresponding done event.
2254 * If we need more data (there are still items on the recv_list after we consume all
2255 * our data) then arrange for another system recv() call to fill our buffers.
2257 static void
2258 internal_recv(isc_socket_t *sock, int nbytes)
2260 INSIST(VALID_SOCKET(sock));
2262 LOCK(&sock->lock);
2263 CONSISTENT(sock);
2265 socket_log(__LINE__, sock, NULL, IOEVENT,
2266 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2267 "internal_recv: %d bytes received", nbytes);
2270 * If we got here, the I/O operation succeeded. However, we might still have removed this
2271 * event from our notification list (or never placed it on it due to immediate completion.)
2272 * Handle the reference counting here, and handle the cancellation event just after.
2274 INSIST(sock->pending_iocp > 0);
2275 sock->pending_iocp--;
2276 INSIST(sock->pending_recv > 0);
2277 sock->pending_recv--;
2280 * The only way we could have gotten here is that our I/O has successfully completed.
2281 * Update our pointers, and move on. The only odd case here is that we might not
2282 * have received enough data on a TCP stream to satisfy the minimum requirements. If
2283 * this is the case, we will re-issue the recv() call for what we need.
2285 * We do check for a recv() of 0 bytes on a TCP stream. This means the remote end
2286 * has closed.
2288 if (nbytes == 0 && sock->type == isc_sockettype_tcp) {
2289 send_recvdone_abort(sock, ISC_R_EOF);
2290 maybe_free_socket(&sock, __LINE__);
2291 return;
2293 sock->recvbuf.remaining = nbytes;
2294 sock->recvbuf.consume_position = sock->recvbuf.base;
2295 completeio_recv(sock);
2298 * If there are more receivers waiting for data, queue another receive
2299 * here.
2301 queue_receive_request(sock);
2304 * Unlock and/or destroy if we are the last thing this socket has left to do.
2306 maybe_free_socket(&sock, __LINE__);
2309 static void
2310 internal_send(isc_socket_t *sock, isc_socketevent_t *dev,
2311 struct msghdr *messagehdr, int nbytes, int send_errno, IoCompletionInfo *lpo)
2313 buflist_t *buffer;
2316 * Find out what socket this is and lock it.
2318 INSIST(VALID_SOCKET(sock));
2320 LOCK(&sock->lock);
2321 CONSISTENT(sock);
2323 socket_log(__LINE__, sock, NULL, IOEVENT,
2324 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2325 "internal_send: task got socket event %p", dev);
2327 buffer = ISC_LIST_HEAD(lpo->bufferlist);
2328 while (buffer != NULL) {
2329 ISC_LIST_DEQUEUE(lpo->bufferlist, buffer, link);
2331 socket_log(__LINE__, sock, NULL, TRACE,
2332 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2333 "free_buffer %p %p", buffer, buffer->buf);
2335 HeapFree(hHeapHandle, 0, buffer->buf);
2336 HeapFree(hHeapHandle, 0, buffer);
2337 buffer = ISC_LIST_HEAD(lpo->bufferlist);
2340 INSIST(sock->pending_iocp > 0);
2341 sock->pending_iocp--;
2342 INSIST(sock->pending_send > 0);
2343 sock->pending_send--;
2345 /* If the event is no longer in the list we can just return */
2346 if (!senddone_is_active(sock, dev))
2347 goto done;
2350 * Set the error code and send things on its way.
2352 switch (completeio_send(sock, dev, messagehdr, nbytes, send_errno)) {
2353 case DOIO_SOFT:
2354 break;
2355 case DOIO_HARD:
2356 case DOIO_SUCCESS:
2357 send_senddone_event(sock, &dev);
2358 break;
2361 done:
2362 maybe_free_socket(&sock, __LINE__);
2366 * These return if the done event passed in is on the list (or for connect, is
2367 * the one we're waiting for. Using these ensures we will not double-send an
2368 * event.
2370 static isc_boolean_t
2371 senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev)
2373 isc_socketevent_t *ldev;
2375 ldev = ISC_LIST_HEAD(sock->send_list);
2376 while (ldev != NULL && ldev != dev)
2377 ldev = ISC_LIST_NEXT(ldev, ev_link);
2379 return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2382 static isc_boolean_t
2383 acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev)
2385 isc_socket_newconnev_t *ldev;
2387 ldev = ISC_LIST_HEAD(sock->accept_list);
2388 while (ldev != NULL && ldev != dev)
2389 ldev = ISC_LIST_NEXT(ldev, ev_link);
2391 return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2394 static isc_boolean_t
2395 connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev)
2397 return (sock->connect_ev == dev ? ISC_TRUE : ISC_FALSE);
2401 // The Windows network stack seems to have two very distinct paths depending
2402 // on what is installed. Specifically, if something is looking at network
2403 // connections (like an anti-virus or anti-malware application, such as
2404 // McAfee products) Windows may return additional error conditions which
2405 // were not previously returned.
2407 // One specific one is when a TCP SYN scan is used. In this situation,
2408 // Windows responds with the SYN-ACK, but the scanner never responds with
2409 // the 3rd packet, the ACK. Windows consiers this a partially open connection.
2410 // Most Unix networking stacks, and Windows without McAfee installed, will
2411 // not return this to the caller. However, with this product installed,
2412 // Windows returns this as a failed status on the Accept() call. Here, we
2413 // will just re-issue the ISCAcceptEx() call as if nothing had happened.
2415 // This code should only be called when the listening socket has received
2416 // such an error. Additionally, the "parent" socket must be locked.
2417 // Additionally, the lpo argument is re-used here, and must not be freed
2418 // by the caller.
2420 static isc_result_t
2421 restart_accept(isc_socket_t *parent, IoCompletionInfo *lpo)
2423 isc_socket_t *nsock = lpo->adev->newsocket;
2424 SOCKET new_fd;
2427 * AcceptEx() requires we pass in a socket. Note that we carefully
2428 * do not close the previous socket in case of an error message returned by
2429 * our new socket() call. If we return an error here, our caller will
2430 * clean up.
2432 new_fd = socket(parent->pf, SOCK_STREAM, IPPROTO_TCP);
2433 if (nsock->fd == INVALID_SOCKET) {
2434 return (ISC_R_FAILURE); // parent will ask windows for error message
2436 closesocket(nsock->fd);
2437 nsock->fd = new_fd;
2439 memset(&lpo->overlapped, 0, sizeof(lpo->overlapped));
2441 ISCAcceptEx(parent->fd,
2442 nsock->fd, /* Accepted Socket */
2443 lpo->acceptbuffer, /* Buffer for initial Recv */
2444 0, /* Length of Buffer */
2445 sizeof(SOCKADDR_STORAGE) + 16, /* Local address length + 16 */
2446 sizeof(SOCKADDR_STORAGE) + 16, /* Remote address lengh + 16 */
2447 (LPDWORD)&lpo->received_bytes, /* Bytes Recved */
2448 (LPOVERLAPPED)lpo /* Overlapped structure */
2451 InterlockedDecrement(&nsock->manager->iocp_total);
2452 iocompletionport_update(nsock);
2454 return (ISC_R_SUCCESS);
2458 * This is the I/O Completion Port Worker Function. It loops forever
2459 * waiting for I/O to complete and then forwards them for further
2460 * processing. There are a number of these in separate threads.
2462 static isc_threadresult_t WINAPI
2463 SocketIoThread(LPVOID ThreadContext) {
2464 isc_socketmgr_t *manager = ThreadContext;
2465 BOOL bSuccess = FALSE;
2466 DWORD nbytes;
2467 IoCompletionInfo *lpo = NULL;
2468 isc_socket_t *sock = NULL;
2469 int request;
2470 struct msghdr *messagehdr = NULL;
2471 int errval;
2472 char strbuf[ISC_STRERRORSIZE];
2473 int errstatus;
2475 REQUIRE(VALID_MANAGER(manager));
2478 * Set the thread priority high enough so I/O will
2479 * preempt normal recv packet processing, but not
2480 * higher than the timer sync thread.
2482 if (!SetThreadPriority(GetCurrentThread(),
2483 THREAD_PRIORITY_ABOVE_NORMAL)) {
2484 errval = GetLastError();
2485 isc__strerror(errval, strbuf, sizeof(strbuf));
2486 FATAL_ERROR(__FILE__, __LINE__,
2487 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2488 ISC_MSG_FAILED,
2489 "Can't set thread priority: %s"),
2490 strbuf);
2494 * Loop forever waiting on I/O Completions and then processing them
2496 while (TRUE) {
2497 wait_again:
2498 bSuccess = GetQueuedCompletionStatus(manager->hIoCompletionPort,
2499 &nbytes,
2500 (PULONG_PTR)&sock,
2501 (LPWSAOVERLAPPED *)&lpo,
2502 INFINITE);
2503 if (lpo == NULL) /* Received request to exit */
2504 break;
2506 REQUIRE(VALID_SOCKET(sock));
2508 request = lpo->request_type;
2510 errstatus = 0;
2511 if (!bSuccess) {
2512 isc_result_t isc_result;
2515 * Did the I/O operation complete?
2517 errstatus = GetLastError();
2518 isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2520 LOCK(&sock->lock);
2521 CONSISTENT(sock);
2522 switch (request) {
2523 case SOCKET_RECV:
2524 INSIST(sock->pending_iocp > 0);
2525 sock->pending_iocp--;
2526 INSIST(sock->pending_recv > 0);
2527 sock->pending_recv--;
2528 if (!sock->connected &&
2529 ((errstatus == ERROR_HOST_UNREACHABLE) ||
2530 (errstatus == WSAENETRESET) ||
2531 (errstatus == WSAECONNRESET))) {
2532 /* ignore soft errors */
2533 queue_receive_request(sock);
2534 break;
2536 send_recvdone_abort(sock, isc_result);
2537 if (isc_result == ISC_R_UNEXPECTED) {
2538 UNEXPECTED_ERROR(__FILE__, __LINE__,
2539 "SOCKET_RECV: Windows error code: %d, returning ISC error %d",
2540 errstatus, isc_result);
2542 break;
2544 case SOCKET_SEND:
2545 INSIST(sock->pending_iocp > 0);
2546 sock->pending_iocp--;
2547 INSIST(sock->pending_send > 0);
2548 sock->pending_send--;
2549 if (senddone_is_active(sock, lpo->dev)) {
2550 lpo->dev->result = isc_result;
2551 socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2552 "canceled_send");
2553 send_senddone_event(sock, &lpo->dev);
2555 break;
2557 case SOCKET_ACCEPT:
2558 INSIST(sock->pending_iocp > 0);
2559 INSIST(sock->pending_accept > 0);
2561 socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2562 "Accept: errstatus=%d isc_result=%d", errstatus, isc_result);
2564 if (acceptdone_is_active(sock, lpo->adev)) {
2565 if (restart_accept(sock, lpo) == ISC_R_SUCCESS) {
2566 UNLOCK(&sock->lock);
2567 goto wait_again;
2568 } else {
2569 errstatus = GetLastError();
2570 isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2571 socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2572 "restart_accept() failed: errstatus=%d isc_result=%d",
2573 errstatus, isc_result);
2577 sock->pending_iocp--;
2578 sock->pending_accept--;
2579 if (acceptdone_is_active(sock, lpo->adev)) {
2580 closesocket(lpo->adev->newsocket->fd);
2581 lpo->adev->newsocket->fd = INVALID_SOCKET;
2582 lpo->adev->newsocket->references--;
2583 free_socket(&lpo->adev->newsocket, __LINE__);
2584 lpo->adev->result = isc_result;
2585 socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2586 "canceled_accept");
2587 send_acceptdone_event(sock, &lpo->adev);
2589 break;
2591 case SOCKET_CONNECT:
2592 INSIST(sock->pending_iocp > 0);
2593 sock->pending_iocp--;
2594 INSIST(sock->pending_connect == 1);
2595 sock->pending_connect = 0;
2596 if (connectdone_is_active(sock, lpo->cdev)) {
2597 lpo->cdev->result = isc_result;
2598 socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2599 "canceled_connect");
2600 send_connectdone_event(sock, &lpo->cdev);
2602 break;
2604 maybe_free_socket(&sock, __LINE__);
2606 if (lpo != NULL)
2607 HeapFree(hHeapHandle, 0, lpo);
2608 continue;
2611 messagehdr = &lpo->messagehdr;
2613 switch (request) {
2614 case SOCKET_RECV:
2615 internal_recv(sock, nbytes);
2616 break;
2617 case SOCKET_SEND:
2618 internal_send(sock, lpo->dev, messagehdr, nbytes, errstatus, lpo);
2619 break;
2620 case SOCKET_ACCEPT:
2621 internal_accept(sock, lpo, errstatus);
2622 break;
2623 case SOCKET_CONNECT:
2624 internal_connect(sock, lpo, errstatus);
2625 break;
2628 if (lpo != NULL)
2629 HeapFree(hHeapHandle, 0, lpo);
2633 * Exit Completion Port Thread
2635 manager_log(manager, TRACE,
2636 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2637 ISC_MSG_EXITING, "SocketIoThread exiting"));
2638 return ((isc_threadresult_t)0);
2642 * Create a new socket manager.
2644 isc_result_t
2645 isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2646 return (isc_socketmgr_create2(mctx, managerp, 0));
2649 isc_result_t
2650 isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
2651 unsigned int maxsocks)
2653 isc_socketmgr_t *manager;
2654 isc_result_t result;
2656 REQUIRE(managerp != NULL && *managerp == NULL);
2658 if (maxsocks != 0)
2659 return (ISC_R_NOTIMPLEMENTED);
2661 manager = isc_mem_get(mctx, sizeof(*manager));
2662 if (manager == NULL)
2663 return (ISC_R_NOMEMORY);
2665 InitSockets();
2667 manager->magic = SOCKET_MANAGER_MAGIC;
2668 manager->mctx = NULL;
2669 manager->stats = NULL;
2670 ISC_LIST_INIT(manager->socklist);
2671 result = isc_mutex_init(&manager->lock);
2672 if (result != ISC_R_SUCCESS) {
2673 isc_mem_put(mctx, manager, sizeof(*manager));
2674 return (result);
2676 if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2677 DESTROYLOCK(&manager->lock);
2678 isc_mem_put(mctx, manager, sizeof(*manager));
2679 UNEXPECTED_ERROR(__FILE__, __LINE__,
2680 "isc_condition_init() %s",
2681 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2682 ISC_MSG_FAILED, "failed"));
2683 return (ISC_R_UNEXPECTED);
2686 isc_mem_attach(mctx, &manager->mctx);
2688 iocompletionport_init(manager); /* Create the Completion Ports */
2690 manager->bShutdown = ISC_FALSE;
2691 manager->totalSockets = 0;
2692 manager->iocp_total = 0;
2694 *managerp = manager;
2696 return (ISC_R_SUCCESS);
2699 isc_result_t
2700 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
2701 REQUIRE(VALID_MANAGER(manager));
2702 REQUIRE(nsockp != NULL);
2704 return (ISC_R_NOTIMPLEMENTED);
2707 void
2708 isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
2709 REQUIRE(VALID_MANAGER(manager));
2710 REQUIRE(ISC_LIST_EMPTY(manager->socklist));
2711 REQUIRE(manager->stats == NULL);
2712 REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
2714 isc_stats_attach(stats, &manager->stats);
2717 void
2718 isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
2719 isc_socketmgr_t *manager;
2720 int i;
2721 isc_mem_t *mctx;
2724 * Destroy a socket manager.
2727 REQUIRE(managerp != NULL);
2728 manager = *managerp;
2729 REQUIRE(VALID_MANAGER(manager));
2731 LOCK(&manager->lock);
2734 * Wait for all sockets to be destroyed.
2736 while (!ISC_LIST_EMPTY(manager->socklist)) {
2737 manager_log(manager, CREATION,
2738 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2739 ISC_MSG_SOCKETSREMAIN,
2740 "sockets exist"));
2741 WAIT(&manager->shutdown_ok, &manager->lock);
2744 UNLOCK(&manager->lock);
2747 * Here, we need to had some wait code for the completion port
2748 * thread.
2750 signal_iocompletionport_exit(manager);
2751 manager->bShutdown = ISC_TRUE;
2754 * Wait for threads to exit.
2756 for (i = 0; i < manager->maxIOCPThreads; i++) {
2757 if (isc_thread_join((isc_thread_t) manager->hIOCPThreads[i],
2758 NULL) != ISC_R_SUCCESS)
2759 UNEXPECTED_ERROR(__FILE__, __LINE__,
2760 "isc_thread_join() for Completion Port %s",
2761 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2762 ISC_MSG_FAILED, "failed"));
2765 * Clean up.
2768 CloseHandle(manager->hIoCompletionPort);
2770 (void)isc_condition_destroy(&manager->shutdown_ok);
2772 DESTROYLOCK(&manager->lock);
2773 if (manager->stats != NULL)
2774 isc_stats_detach(&manager->stats);
2775 manager->magic = 0;
2776 mctx= manager->mctx;
2777 isc_mem_put(mctx, manager, sizeof(*manager));
2779 isc_mem_detach(&mctx);
2781 *managerp = NULL;
2784 static void
2785 queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev)
2787 isc_task_t *ntask = NULL;
2789 isc_task_attach(task, &ntask);
2790 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2793 * Enqueue the request.
2795 INSIST(!ISC_LINK_LINKED(dev, ev_link));
2796 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2798 socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2799 "queue_receive_event: event %p -> task %p",
2800 dev, ntask);
2804 * Check the pending receive queue, and if we have data pending, give it to this
2805 * caller. If we have none, queue an I/O request. If this caller is not the first
2806 * on the list, then we will just queue this event and return.
2808 * Caller must have the socket locked.
2810 static isc_result_t
2811 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2812 unsigned int flags)
2814 isc_result_t result = ISC_R_SUCCESS;
2816 dev->ev_sender = task;
2818 if (sock->fd == INVALID_SOCKET)
2819 return (ISC_R_EOF);
2822 * Queue our event on the list of things to do. Call our function to
2823 * attempt to fill buffers as much as possible, and return done events.
2824 * We are going to lie about our handling of the ISC_SOCKFLAG_IMMEDIATE
2825 * here and tell our caller that we could not satisfy it immediately.
2827 queue_receive_event(sock, task, dev);
2828 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2829 result = ISC_R_INPROGRESS;
2831 completeio_recv(sock);
2834 * If there are more receivers waiting for data, queue another receive
2835 * here. If the
2837 queue_receive_request(sock);
2839 return (result);
2842 isc_result_t
2843 isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2844 unsigned int minimum, isc_task_t *task,
2845 isc_taskaction_t action, void *arg)
2847 isc_socketevent_t *dev;
2848 isc_socketmgr_t *manager;
2849 unsigned int iocount;
2850 isc_buffer_t *buffer;
2851 isc_result_t ret;
2853 REQUIRE(VALID_SOCKET(sock));
2854 LOCK(&sock->lock);
2855 CONSISTENT(sock);
2858 * Make sure that the socket is not closed. XXXMLG change error here?
2860 if (sock->fd == INVALID_SOCKET) {
2861 UNLOCK(&sock->lock);
2862 return (ISC_R_CONNREFUSED);
2865 REQUIRE(buflist != NULL);
2866 REQUIRE(!ISC_LIST_EMPTY(*buflist));
2867 REQUIRE(task != NULL);
2868 REQUIRE(action != NULL);
2870 manager = sock->manager;
2871 REQUIRE(VALID_MANAGER(manager));
2873 iocount = isc_bufferlist_availablecount(buflist);
2874 REQUIRE(iocount > 0);
2876 INSIST(sock->bound);
2878 dev = allocate_socketevent(manager->mctx, sock,
2879 ISC_SOCKEVENT_RECVDONE, action, arg);
2880 if (dev == NULL) {
2881 UNLOCK(&sock->lock);
2882 return (ISC_R_NOMEMORY);
2886 * UDP sockets are always partial read
2888 if (sock->type == isc_sockettype_udp)
2889 dev->minimum = 1;
2890 else {
2891 if (minimum == 0)
2892 dev->minimum = iocount;
2893 else
2894 dev->minimum = minimum;
2898 * Move each buffer from the passed in list to our internal one.
2900 buffer = ISC_LIST_HEAD(*buflist);
2901 while (buffer != NULL) {
2902 ISC_LIST_DEQUEUE(*buflist, buffer, link);
2903 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2904 buffer = ISC_LIST_HEAD(*buflist);
2907 ret = socket_recv(sock, dev, task, 0);
2909 UNLOCK(&sock->lock);
2910 return (ret);
2913 isc_result_t
2914 isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
2915 unsigned int minimum, isc_task_t *task,
2916 isc_taskaction_t action, void *arg)
2918 isc_socketevent_t *dev;
2919 isc_socketmgr_t *manager;
2920 isc_result_t ret;
2922 REQUIRE(VALID_SOCKET(sock));
2923 LOCK(&sock->lock);
2924 CONSISTENT(sock);
2927 * make sure that the socket's not closed
2929 if (sock->fd == INVALID_SOCKET) {
2930 UNLOCK(&sock->lock);
2931 return (ISC_R_CONNREFUSED);
2933 REQUIRE(action != NULL);
2935 manager = sock->manager;
2936 REQUIRE(VALID_MANAGER(manager));
2938 INSIST(sock->bound);
2940 dev = allocate_socketevent(manager->mctx, sock,
2941 ISC_SOCKEVENT_RECVDONE, action, arg);
2942 if (dev == NULL) {
2943 UNLOCK(&sock->lock);
2944 return (ISC_R_NOMEMORY);
2947 ret = isc_socket_recv2(sock, region, minimum, task, dev, 0);
2948 UNLOCK(&sock->lock);
2949 return (ret);
2952 isc_result_t
2953 isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
2954 unsigned int minimum, isc_task_t *task,
2955 isc_socketevent_t *event, unsigned int flags)
2957 isc_result_t ret;
2959 REQUIRE(VALID_SOCKET(sock));
2960 LOCK(&sock->lock);
2961 CONSISTENT(sock);
2963 event->result = ISC_R_UNEXPECTED;
2964 event->ev_sender = sock;
2966 * make sure that the socket's not closed
2968 if (sock->fd == INVALID_SOCKET) {
2969 UNLOCK(&sock->lock);
2970 return (ISC_R_CONNREFUSED);
2973 ISC_LIST_INIT(event->bufferlist);
2974 event->region = *region;
2975 event->n = 0;
2976 event->offset = 0;
2977 event->attributes = 0;
2980 * UDP sockets are always partial read.
2982 if (sock->type == isc_sockettype_udp)
2983 event->minimum = 1;
2984 else {
2985 if (minimum == 0)
2986 event->minimum = region->length;
2987 else
2988 event->minimum = minimum;
2991 ret = socket_recv(sock, event, task, flags);
2992 UNLOCK(&sock->lock);
2993 return (ret);
2997 * Caller must have the socket locked.
2999 static isc_result_t
3000 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
3001 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
3002 unsigned int flags)
3004 int io_state;
3005 int send_errno = 0;
3006 int cc = 0;
3007 isc_task_t *ntask = NULL;
3008 isc_result_t result = ISC_R_SUCCESS;
3010 dev->ev_sender = task;
3012 set_dev_address(address, sock, dev);
3013 if (pktinfo != NULL) {
3014 socket_log(__LINE__, sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
3015 ISC_MSG_PKTINFOPROVIDED,
3016 "pktinfo structure provided, ifindex %u (set to 0)",
3017 pktinfo->ipi6_ifindex);
3019 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
3020 dev->pktinfo = *pktinfo;
3022 * Set the pktinfo index to 0 here, to let the kernel decide
3023 * what interface it should send on.
3025 dev->pktinfo.ipi6_ifindex = 0;
3028 io_state = startio_send(sock, dev, &cc, &send_errno);
3029 switch (io_state) {
3030 case DOIO_PENDING: /* I/O started. Enqueue completion event. */
3031 case DOIO_SOFT:
3033 * We couldn't send all or part of the request right now, so
3034 * queue it unless ISC_SOCKFLAG_NORETRY is set.
3036 if ((flags & ISC_SOCKFLAG_NORETRY) == 0 ||
3037 io_state == DOIO_PENDING) {
3038 isc_task_attach(task, &ntask);
3039 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
3042 * Enqueue the request.
3044 INSIST(!ISC_LINK_LINKED(dev, ev_link));
3045 ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
3047 socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
3048 "socket_send: event %p -> task %p",
3049 dev, ntask);
3051 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
3052 result = ISC_R_INPROGRESS;
3053 break;
3056 case DOIO_SUCCESS:
3057 break;
3060 return (result);
3063 isc_result_t
3064 isc__socket_send(isc_socket_t *sock, isc_region_t *region,
3065 isc_task_t *task, isc_taskaction_t action, void *arg)
3068 * REQUIRE() checking is performed in isc_socket_sendto().
3070 return (isc_socket_sendto(sock, region, task, action, arg, NULL,
3071 NULL));
3074 isc_result_t
3075 isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
3076 isc_task_t *task, isc_taskaction_t action, void *arg,
3077 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3079 isc_socketevent_t *dev;
3080 isc_socketmgr_t *manager;
3081 isc_result_t ret;
3083 REQUIRE(VALID_SOCKET(sock));
3084 REQUIRE(sock->type != isc_sockettype_fdwatch);
3086 LOCK(&sock->lock);
3087 CONSISTENT(sock);
3090 * make sure that the socket's not closed
3092 if (sock->fd == INVALID_SOCKET) {
3093 UNLOCK(&sock->lock);
3094 return (ISC_R_CONNREFUSED);
3096 REQUIRE(region != NULL);
3097 REQUIRE(task != NULL);
3098 REQUIRE(action != NULL);
3100 manager = sock->manager;
3101 REQUIRE(VALID_MANAGER(manager));
3103 INSIST(sock->bound);
3105 dev = allocate_socketevent(manager->mctx, sock,
3106 ISC_SOCKEVENT_SENDDONE, action, arg);
3107 if (dev == NULL) {
3108 UNLOCK(&sock->lock);
3109 return (ISC_R_NOMEMORY);
3111 dev->region = *region;
3113 ret = socket_send(sock, dev, task, address, pktinfo, 0);
3114 UNLOCK(&sock->lock);
3115 return (ret);
3118 isc_result_t
3119 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
3120 isc_task_t *task, isc_taskaction_t action, void *arg)
3122 return (isc_socket_sendtov2(sock, buflist, task, action, arg, NULL,
3123 NULL, 0));
3126 isc_result_t
3127 isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
3128 isc_task_t *task, isc_taskaction_t action, void *arg,
3129 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3131 return (isc_socket_sendtov2(sock, buflist, task, action, arg, address,
3132 pktinfo, 0));
3135 isc_result_t
3136 isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist,
3137 isc_task_t *task, isc_taskaction_t action, void *arg,
3138 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
3139 unsigned int flags)
3141 isc_socketevent_t *dev;
3142 isc_socketmgr_t *manager;
3143 unsigned int iocount;
3144 isc_buffer_t *buffer;
3145 isc_result_t ret;
3147 REQUIRE(VALID_SOCKET(sock));
3149 LOCK(&sock->lock);
3150 CONSISTENT(sock);
3153 * make sure that the socket's not closed
3155 if (sock->fd == INVALID_SOCKET) {
3156 UNLOCK(&sock->lock);
3157 return (ISC_R_CONNREFUSED);
3159 REQUIRE(buflist != NULL);
3160 REQUIRE(!ISC_LIST_EMPTY(*buflist));
3161 REQUIRE(task != NULL);
3162 REQUIRE(action != NULL);
3164 manager = sock->manager;
3165 REQUIRE(VALID_MANAGER(manager));
3167 iocount = isc_bufferlist_usedcount(buflist);
3168 REQUIRE(iocount > 0);
3170 dev = allocate_socketevent(manager->mctx, sock,
3171 ISC_SOCKEVENT_SENDDONE, action, arg);
3172 if (dev == NULL) {
3173 UNLOCK(&sock->lock);
3174 return (ISC_R_NOMEMORY);
3178 * Move each buffer from the passed in list to our internal one.
3180 buffer = ISC_LIST_HEAD(*buflist);
3181 while (buffer != NULL) {
3182 ISC_LIST_DEQUEUE(*buflist, buffer, link);
3183 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
3184 buffer = ISC_LIST_HEAD(*buflist);
3187 ret = socket_send(sock, dev, task, address, pktinfo, flags);
3188 UNLOCK(&sock->lock);
3189 return (ret);
3192 isc_result_t
3193 isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
3194 isc_task_t *task,
3195 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
3196 isc_socketevent_t *event, unsigned int flags)
3198 isc_result_t ret;
3200 REQUIRE(VALID_SOCKET(sock));
3201 LOCK(&sock->lock);
3202 CONSISTENT(sock);
3204 REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
3205 if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
3206 REQUIRE(sock->type == isc_sockettype_udp);
3207 event->ev_sender = sock;
3208 event->result = ISC_R_UNEXPECTED;
3210 * make sure that the socket's not closed
3212 if (sock->fd == INVALID_SOCKET) {
3213 UNLOCK(&sock->lock);
3214 return (ISC_R_CONNREFUSED);
3216 ISC_LIST_INIT(event->bufferlist);
3217 event->region = *region;
3218 event->n = 0;
3219 event->offset = 0;
3220 event->attributes = 0;
3222 ret = socket_send(sock, event, task, address, pktinfo, flags);
3223 UNLOCK(&sock->lock);
3224 return (ret);
3227 isc_result_t
3228 isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
3229 unsigned int options) {
3230 int bind_errno;
3231 char strbuf[ISC_STRERRORSIZE];
3232 int on = 1;
3234 REQUIRE(VALID_SOCKET(sock));
3235 LOCK(&sock->lock);
3236 CONSISTENT(sock);
3239 * make sure that the socket's not closed
3241 if (sock->fd == INVALID_SOCKET) {
3242 UNLOCK(&sock->lock);
3243 return (ISC_R_CONNREFUSED);
3246 INSIST(!sock->bound);
3247 INSIST(!sock->dupped);
3249 if (sock->pf != sockaddr->type.sa.sa_family) {
3250 UNLOCK(&sock->lock);
3251 return (ISC_R_FAMILYMISMATCH);
3254 * Only set SO_REUSEADDR when we want a specific port.
3256 if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
3257 isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
3258 setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
3259 sizeof(on)) < 0) {
3260 UNEXPECTED_ERROR(__FILE__, __LINE__,
3261 "setsockopt(%d) %s", sock->fd,
3262 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3263 ISC_MSG_FAILED, "failed"));
3264 /* Press on... */
3266 if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
3267 bind_errno = WSAGetLastError();
3268 UNLOCK(&sock->lock);
3269 switch (bind_errno) {
3270 case WSAEACCES:
3271 return (ISC_R_NOPERM);
3272 case WSAEADDRNOTAVAIL:
3273 return (ISC_R_ADDRNOTAVAIL);
3274 case WSAEADDRINUSE:
3275 return (ISC_R_ADDRINUSE);
3276 case WSAEINVAL:
3277 return (ISC_R_BOUND);
3278 default:
3279 isc__strerror(bind_errno, strbuf, sizeof(strbuf));
3280 UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
3281 strbuf);
3282 return (ISC_R_UNEXPECTED);
3286 socket_log(__LINE__, sock, sockaddr, TRACE,
3287 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
3288 sock->bound = 1;
3290 UNLOCK(&sock->lock);
3291 return (ISC_R_SUCCESS);
3294 isc_result_t
3295 isc__socket_filter(isc_socket_t *sock, const char *filter) {
3296 UNUSED(sock);
3297 UNUSED(filter);
3299 REQUIRE(VALID_SOCKET(sock));
3300 return (ISC_R_NOTIMPLEMENTED);
3304 * Set up to listen on a given socket. We do this by creating an internal
3305 * event that will be dispatched when the socket has read activity. The
3306 * watcher will send the internal event to the task when there is a new
3307 * connection.
3309 * Unlike in read, we don't preallocate a done event here. Every time there
3310 * is a new connection we'll have to allocate a new one anyway, so we might
3311 * as well keep things simple rather than having to track them.
3313 isc_result_t
3314 isc__socket_listen(isc_socket_t *sock, unsigned int backlog) {
3315 char strbuf[ISC_STRERRORSIZE];
3317 REQUIRE(VALID_SOCKET(sock));
3319 LOCK(&sock->lock);
3320 CONSISTENT(sock);
3323 * make sure that the socket's not closed
3325 if (sock->fd == INVALID_SOCKET) {
3326 UNLOCK(&sock->lock);
3327 return (ISC_R_CONNREFUSED);
3330 REQUIRE(!sock->listener);
3331 REQUIRE(sock->bound);
3332 REQUIRE(sock->type == isc_sockettype_tcp);
3334 if (backlog == 0)
3335 backlog = SOMAXCONN;
3337 if (listen(sock->fd, (int)backlog) < 0) {
3338 UNLOCK(&sock->lock);
3339 isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3341 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
3343 return (ISC_R_UNEXPECTED);
3346 socket_log(__LINE__, sock, NULL, TRACE,
3347 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "listening");
3348 sock->listener = 1;
3349 _set_state(sock, SOCK_LISTEN);
3351 UNLOCK(&sock->lock);
3352 return (ISC_R_SUCCESS);
3356 * This should try to do aggressive accept() XXXMLG
3358 isc_result_t
3359 isc__socket_accept(isc_socket_t *sock,
3360 isc_task_t *task, isc_taskaction_t action, void *arg)
3362 isc_socket_newconnev_t *adev;
3363 isc_socketmgr_t *manager;
3364 isc_task_t *ntask = NULL;
3365 isc_socket_t *nsock;
3366 isc_result_t result;
3367 IoCompletionInfo *lpo;
3369 REQUIRE(VALID_SOCKET(sock));
3371 manager = sock->manager;
3372 REQUIRE(VALID_MANAGER(manager));
3374 LOCK(&sock->lock);
3375 CONSISTENT(sock);
3378 * make sure that the socket's not closed
3380 if (sock->fd == INVALID_SOCKET) {
3381 UNLOCK(&sock->lock);
3382 return (ISC_R_CONNREFUSED);
3385 REQUIRE(sock->listener);
3388 * Sender field is overloaded here with the task we will be sending
3389 * this event to. Just before the actual event is delivered the
3390 * actual ev_sender will be touched up to be the socket.
3392 adev = (isc_socket_newconnev_t *)
3393 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3394 action, arg, sizeof(*adev));
3395 if (adev == NULL) {
3396 UNLOCK(&sock->lock);
3397 return (ISC_R_NOMEMORY);
3399 ISC_LINK_INIT(adev, ev_link);
3401 result = allocate_socket(manager, sock->type, &nsock);
3402 if (result != ISC_R_SUCCESS) {
3403 isc_event_free((isc_event_t **)&adev);
3404 UNLOCK(&sock->lock);
3405 return (result);
3409 * AcceptEx() requires we pass in a socket.
3411 nsock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
3412 if (nsock->fd == INVALID_SOCKET) {
3413 free_socket(&nsock, __LINE__);
3414 isc_event_free((isc_event_t **)&adev);
3415 UNLOCK(&sock->lock);
3416 return (ISC_R_FAILURE); // XXXMLG need real error message
3420 * Attach to socket and to task.
3422 isc_task_attach(task, &ntask);
3423 if (isc_task_exiting(ntask)) {
3424 free_socket(&nsock, __LINE__);
3425 isc_task_detach(&ntask);
3426 isc_event_free(ISC_EVENT_PTR(&adev));
3427 UNLOCK(&sock->lock);
3428 return (ISC_R_SHUTTINGDOWN);
3430 nsock->references++;
3432 adev->ev_sender = ntask;
3433 adev->newsocket = nsock;
3434 _set_state(nsock, SOCK_ACCEPT);
3437 * Queue io completion for an accept().
3439 lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3440 HEAP_ZERO_MEMORY,
3441 sizeof(IoCompletionInfo));
3442 RUNTIME_CHECK(lpo != NULL);
3443 lpo->acceptbuffer = (void *)HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY,
3444 (sizeof(SOCKADDR_STORAGE) + 16) * 2);
3445 RUNTIME_CHECK(lpo->acceptbuffer != NULL);
3447 lpo->adev = adev;
3448 lpo->request_type = SOCKET_ACCEPT;
3450 ISCAcceptEx(sock->fd,
3451 nsock->fd, /* Accepted Socket */
3452 lpo->acceptbuffer, /* Buffer for initial Recv */
3453 0, /* Length of Buffer */
3454 sizeof(SOCKADDR_STORAGE) + 16, /* Local address length + 16 */
3455 sizeof(SOCKADDR_STORAGE) + 16, /* Remote address lengh + 16 */
3456 (LPDWORD)&lpo->received_bytes, /* Bytes Recved */
3457 (LPOVERLAPPED)lpo /* Overlapped structure */
3459 iocompletionport_update(nsock);
3461 socket_log(__LINE__, sock, NULL, TRACE,
3462 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND,
3463 "accepting for nsock %p fd %d", nsock, nsock->fd);
3466 * Enqueue the event
3468 ISC_LIST_ENQUEUE(sock->accept_list, adev, ev_link);
3469 sock->pending_accept++;
3470 sock->pending_iocp++;
3472 UNLOCK(&sock->lock);
3473 return (ISC_R_SUCCESS);
3476 isc_result_t
3477 isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3478 isc_task_t *task, isc_taskaction_t action, void *arg)
3480 char strbuf[ISC_STRERRORSIZE];
3481 isc_socket_connev_t *cdev;
3482 isc_task_t *ntask = NULL;
3483 isc_socketmgr_t *manager;
3484 IoCompletionInfo *lpo;
3485 int bind_errno;
3487 REQUIRE(VALID_SOCKET(sock));
3488 REQUIRE(addr != NULL);
3489 REQUIRE(task != NULL);
3490 REQUIRE(action != NULL);
3492 manager = sock->manager;
3493 REQUIRE(VALID_MANAGER(manager));
3494 REQUIRE(addr != NULL);
3496 if (isc_sockaddr_ismulticast(addr))
3497 return (ISC_R_MULTICAST);
3499 LOCK(&sock->lock);
3500 CONSISTENT(sock);
3503 * make sure that the socket's not closed
3505 if (sock->fd == INVALID_SOCKET) {
3506 UNLOCK(&sock->lock);
3507 return (ISC_R_CONNREFUSED);
3511 * Windows sockets won't connect unless the socket is bound.
3513 if (!sock->bound) {
3514 isc_sockaddr_t any;
3516 isc_sockaddr_anyofpf(&any, isc_sockaddr_pf(addr));
3517 if (bind(sock->fd, &any.type.sa, any.length) < 0) {
3518 bind_errno = WSAGetLastError();
3519 UNLOCK(&sock->lock);
3520 switch (bind_errno) {
3521 case WSAEACCES:
3522 return (ISC_R_NOPERM);
3523 case WSAEADDRNOTAVAIL:
3524 return (ISC_R_ADDRNOTAVAIL);
3525 case WSAEADDRINUSE:
3526 return (ISC_R_ADDRINUSE);
3527 case WSAEINVAL:
3528 return (ISC_R_BOUND);
3529 default:
3530 isc__strerror(bind_errno, strbuf,
3531 sizeof(strbuf));
3532 UNEXPECTED_ERROR(__FILE__, __LINE__,
3533 "bind: %s", strbuf);
3534 return (ISC_R_UNEXPECTED);
3537 sock->bound = 1;
3540 REQUIRE(!sock->pending_connect);
3542 cdev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3543 ISC_SOCKEVENT_CONNECT,
3544 action, arg,
3545 sizeof(*cdev));
3546 if (cdev == NULL) {
3547 UNLOCK(&sock->lock);
3548 return (ISC_R_NOMEMORY);
3550 ISC_LINK_INIT(cdev, ev_link);
3552 if (sock->type == isc_sockettype_tcp) {
3554 * Queue io completion for an accept().
3556 lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3557 HEAP_ZERO_MEMORY,
3558 sizeof(IoCompletionInfo));
3559 lpo->cdev = cdev;
3560 lpo->request_type = SOCKET_CONNECT;
3562 sock->address = *addr;
3563 ISCConnectEx(sock->fd, &addr->type.sa, addr->length,
3564 NULL, 0, NULL, (LPOVERLAPPED)lpo);
3567 * Attach to task.
3569 isc_task_attach(task, &ntask);
3570 cdev->ev_sender = ntask;
3572 sock->pending_connect = 1;
3573 _set_state(sock, SOCK_CONNECT);
3576 * Enqueue the request.
3578 sock->connect_ev = cdev;
3579 sock->pending_iocp++;
3580 } else {
3581 WSAConnect(sock->fd, &addr->type.sa, addr->length, NULL, NULL, NULL, NULL);
3582 cdev->result = ISC_R_SUCCESS;
3583 isc_task_send(task, (isc_event_t **)&cdev);
3585 CONSISTENT(sock);
3586 UNLOCK(&sock->lock);
3588 return (ISC_R_SUCCESS);
3591 isc_result_t
3592 isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3593 isc_result_t result;
3595 REQUIRE(VALID_SOCKET(sock));
3596 REQUIRE(addressp != NULL);
3598 LOCK(&sock->lock);
3599 CONSISTENT(sock);
3602 * make sure that the socket's not closed
3604 if (sock->fd == INVALID_SOCKET) {
3605 UNLOCK(&sock->lock);
3606 return (ISC_R_CONNREFUSED);
3609 if (sock->connected) {
3610 *addressp = sock->address;
3611 result = ISC_R_SUCCESS;
3612 } else {
3613 result = ISC_R_NOTCONNECTED;
3616 UNLOCK(&sock->lock);
3618 return (result);
3621 isc_result_t
3622 isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3623 ISC_SOCKADDR_LEN_T len;
3624 isc_result_t result;
3625 char strbuf[ISC_STRERRORSIZE];
3627 REQUIRE(VALID_SOCKET(sock));
3628 REQUIRE(addressp != NULL);
3630 LOCK(&sock->lock);
3631 CONSISTENT(sock);
3634 * make sure that the socket's not closed
3636 if (sock->fd == INVALID_SOCKET) {
3637 UNLOCK(&sock->lock);
3638 return (ISC_R_CONNREFUSED);
3641 if (!sock->bound) {
3642 result = ISC_R_NOTBOUND;
3643 goto out;
3646 result = ISC_R_SUCCESS;
3648 len = sizeof(addressp->type);
3649 if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3650 isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3651 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3652 strbuf);
3653 result = ISC_R_UNEXPECTED;
3654 goto out;
3656 addressp->length = (unsigned int)len;
3658 out:
3659 UNLOCK(&sock->lock);
3661 return (result);
3665 * Run through the list of events on this socket, and cancel the ones
3666 * queued for task "task" of type "how". "how" is a bitmask.
3668 void
3669 isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
3671 REQUIRE(VALID_SOCKET(sock));
3674 * Quick exit if there is nothing to do. Don't even bother locking
3675 * in this case.
3677 if (how == 0)
3678 return;
3680 LOCK(&sock->lock);
3681 CONSISTENT(sock);
3684 * make sure that the socket's not closed
3686 if (sock->fd == INVALID_SOCKET) {
3687 UNLOCK(&sock->lock);
3688 return;
3692 * All of these do the same thing, more or less.
3693 * Each will:
3694 * o If the internal event is marked as "posted" try to
3695 * remove it from the task's queue. If this fails, mark it
3696 * as canceled instead, and let the task clean it up later.
3697 * o For each I/O request for that task of that type, post
3698 * its done event with status of "ISC_R_CANCELED".
3699 * o Reset any state needed.
3702 if ((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV) {
3703 isc_socketevent_t *dev;
3704 isc_socketevent_t *next;
3705 isc_task_t *current_task;
3707 dev = ISC_LIST_HEAD(sock->recv_list);
3708 while (dev != NULL) {
3709 current_task = dev->ev_sender;
3710 next = ISC_LIST_NEXT(dev, ev_link);
3711 if ((task == NULL) || (task == current_task)) {
3712 dev->result = ISC_R_CANCELED;
3713 send_recvdone_event(sock, &dev);
3715 dev = next;
3718 how &= ~ISC_SOCKCANCEL_RECV;
3720 if ((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND) {
3721 isc_socketevent_t *dev;
3722 isc_socketevent_t *next;
3723 isc_task_t *current_task;
3725 dev = ISC_LIST_HEAD(sock->send_list);
3727 while (dev != NULL) {
3728 current_task = dev->ev_sender;
3729 next = ISC_LIST_NEXT(dev, ev_link);
3730 if ((task == NULL) || (task == current_task)) {
3731 dev->result = ISC_R_CANCELED;
3732 send_senddone_event(sock, &dev);
3734 dev = next;
3737 how &= ~ISC_SOCKCANCEL_SEND;
3739 if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3740 && !ISC_LIST_EMPTY(sock->accept_list)) {
3741 isc_socket_newconnev_t *dev;
3742 isc_socket_newconnev_t *next;
3743 isc_task_t *current_task;
3745 dev = ISC_LIST_HEAD(sock->accept_list);
3746 while (dev != NULL) {
3747 current_task = dev->ev_sender;
3748 next = ISC_LIST_NEXT(dev, ev_link);
3750 if ((task == NULL) || (task == current_task)) {
3752 dev->newsocket->references--;
3753 closesocket(dev->newsocket->fd);
3754 dev->newsocket->fd = INVALID_SOCKET;
3755 free_socket(&dev->newsocket, __LINE__);
3757 dev->result = ISC_R_CANCELED;
3758 send_acceptdone_event(sock, &dev);
3761 dev = next;
3764 how &= ~ISC_SOCKCANCEL_ACCEPT;
3767 * Connecting is not a list.
3769 if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3770 && sock->connect_ev != NULL) {
3771 isc_socket_connev_t *dev;
3772 isc_task_t *current_task;
3774 INSIST(sock->pending_connect);
3776 dev = sock->connect_ev;
3777 current_task = dev->ev_sender;
3779 if ((task == NULL) || (task == current_task)) {
3780 closesocket(sock->fd);
3781 sock->fd = INVALID_SOCKET;
3782 _set_state(sock, SOCK_CLOSED);
3784 sock->connect_ev = NULL;
3785 dev->result = ISC_R_CANCELED;
3786 send_connectdone_event(sock, &dev);
3789 how &= ~ISC_SOCKCANCEL_CONNECT;
3791 maybe_free_socket(&sock, __LINE__);
3794 isc_sockettype_t
3795 isc__socket_gettype(isc_socket_t *sock) {
3796 isc_sockettype_t type;
3798 REQUIRE(VALID_SOCKET(sock));
3800 LOCK(&sock->lock);
3803 * make sure that the socket's not closed
3805 if (sock->fd == INVALID_SOCKET) {
3806 UNLOCK(&sock->lock);
3807 return (ISC_R_CONNREFUSED);
3810 type = sock->type;
3811 UNLOCK(&sock->lock);
3812 return (type);
3815 isc_boolean_t
3816 isc__socket_isbound(isc_socket_t *sock) {
3817 isc_boolean_t val;
3819 REQUIRE(VALID_SOCKET(sock));
3821 LOCK(&sock->lock);
3822 CONSISTENT(sock);
3825 * make sure that the socket's not closed
3827 if (sock->fd == INVALID_SOCKET) {
3828 UNLOCK(&sock->lock);
3829 return (ISC_FALSE);
3832 val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3833 UNLOCK(&sock->lock);
3835 return (val);
3838 void
3839 isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
3840 #if defined(IPV6_V6ONLY)
3841 int onoff = yes ? 1 : 0;
3842 #else
3843 UNUSED(yes);
3844 #endif
3846 REQUIRE(VALID_SOCKET(sock));
3848 #ifdef IPV6_V6ONLY
3849 if (sock->pf == AF_INET6) {
3850 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3851 (char *)&onoff, sizeof(onoff));
3853 #endif
3856 void
3857 isc__socket_dscp(isc_socket_t *sock, isc_dscp_t dscp) {
3858 #if !defined(IP_TOS) && !defined(IPV6_TCLASS)
3859 UNUSED(dscp);
3860 #else
3861 if (dscp < 0)
3862 return;
3864 dscp <<= 2;
3865 dscp &= 0xff;
3866 #endif
3868 REQUIRE(VALID_SOCKET(sock));
3870 #ifdef IP_TOS
3871 if (sock->pf == AF_INET) {
3872 (void)setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
3873 (char *)&dscp, sizeof(dscp));
3875 #endif
3876 #ifdef IPV6_TCLASS
3877 if (sock->pf == AF_INET6) {
3878 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
3879 (char *)&dscp, sizeof(dscp));
3881 #endif
3884 void
3885 isc__socket_cleanunix(isc_sockaddr_t *addr, isc_boolean_t active) {
3886 UNUSED(addr);
3887 UNUSED(active);
3890 isc_result_t
3891 isc__socket_permunix(isc_sockaddr_t *addr, isc_uint32_t perm,
3892 isc_uint32_t owner, isc_uint32_t group)
3894 UNUSED(addr);
3895 UNUSED(perm);
3896 UNUSED(owner);
3897 UNUSED(group);
3898 return (ISC_R_NOTIMPLEMENTED);
3901 void
3902 isc__socket_setname(isc_socket_t *socket, const char *name, void *tag) {
3905 * Name 'socket'.
3908 REQUIRE(VALID_SOCKET(socket));
3910 LOCK(&socket->lock);
3911 memset(socket->name, 0, sizeof(socket->name));
3912 strncpy(socket->name, name, sizeof(socket->name) - 1);
3913 socket->tag = tag;
3914 UNLOCK(&socket->lock);
3917 const char *
3918 isc__socket_getname(isc_socket_t *socket) {
3919 return (socket->name);
3922 void *
3923 isc__socket_gettag(isc_socket_t *socket) {
3924 return (socket->tag);
3928 isc__socket_getfd(isc_socket_t *socket) {
3929 return ((short) socket->fd);
3932 void
3933 isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3934 UNUSED(manager);
3935 UNUSED(reserved);
3938 void
3939 isc___socketmgr_maxudp(isc_socketmgr_t *manager, int maxudp) {
3941 UNUSED(manager);
3942 UNUSED(maxudp);
3945 isc_socketevent_t *
3946 isc_socket_socketevent(isc_mem_t *mctx, void *sender,
3947 isc_eventtype_t eventtype, isc_taskaction_t action,
3948 void *arg)
3950 return (allocate_socketevent(mctx, sender, eventtype, action, arg));
3953 #ifdef HAVE_LIBXML2
3955 static const char *
3956 _socktype(isc_sockettype_t type) {
3957 if (type == isc_sockettype_udp)
3958 return ("udp");
3959 else if (type == isc_sockettype_tcp)
3960 return ("tcp");
3961 else if (type == isc_sockettype_unix)
3962 return ("unix");
3963 else if (type == isc_sockettype_fdwatch)
3964 return ("fdwatch");
3965 else
3966 return ("not-initialized");
3969 #define TRY0(a) do { xmlrc = (a); if (xmlrc < 0) goto error; } while(/*CONSTCOND*/0)
3971 isc_socketmgr_renderxml(isc_socketmgr_t *mgr, xmlTextWriterPtr writer)
3973 isc_socket_t *sock = NULL;
3974 char peerbuf[ISC_SOCKADDR_FORMATSIZE];
3975 isc_sockaddr_t addr;
3976 ISC_SOCKADDR_LEN_T len;
3977 int xmlrc;
3979 LOCK(&mgr->lock);
3981 #ifndef ISC_PLATFORM_USETHREADS
3982 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "references"));
3983 TRY0(xmlTextWriterWriteFormatString(writer, "%d", mgr->refs));
3984 TRY0(xmlTextWriterEndElement(writer));
3985 #endif
3987 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
3988 sock = ISC_LIST_HEAD(mgr->socklist);
3989 while (sock != NULL) {
3990 LOCK(&sock->lock);
3991 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));
3993 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
3994 TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
3995 TRY0(xmlTextWriterEndElement(writer));
3997 if (sock->name[0] != 0) {
3998 TRY0(xmlTextWriterStartElement(writer,
3999 ISC_XMLCHAR "name"));
4000 TRY0(xmlTextWriterWriteFormatString(writer, "%s",
4001 sock->name));
4002 TRY0(xmlTextWriterEndElement(writer)); /* name */
4005 TRY0(xmlTextWriterStartElement(writer,
4006 ISC_XMLCHAR "references"));
4007 TRY0(xmlTextWriterWriteFormatString(writer, "%d",
4008 sock->references));
4009 TRY0(xmlTextWriterEndElement(writer));
4011 TRY0(xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
4012 ISC_XMLCHAR _socktype(sock->type)));
4014 if (sock->connected) {
4015 isc_sockaddr_format(&sock->address, peerbuf,
4016 sizeof(peerbuf));
4017 TRY0(xmlTextWriterWriteElement(writer,
4018 ISC_XMLCHAR "peer-address",
4019 ISC_XMLCHAR peerbuf));
4022 len = sizeof(addr);
4023 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
4024 isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
4025 TRY0(xmlTextWriterWriteElement(writer,
4026 ISC_XMLCHAR "local-address",
4027 ISC_XMLCHAR peerbuf));
4030 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
4031 if (sock->pending_recv)
4032 TRY0(xmlTextWriterWriteElement(writer,
4033 ISC_XMLCHAR "state",
4034 ISC_XMLCHAR "pending-receive"));
4035 if (sock->pending_send)
4036 TRY0(xmlTextWriterWriteElement(writer,
4037 ISC_XMLCHAR "state",
4038 ISC_XMLCHAR "pending-send"));
4039 if (sock->pending_accept)
4040 TRY0(xmlTextWriterWriteElement(writer,
4041 ISC_XMLCHAR "state",
4042 ISC_XMLCHAR "pending_accept"));
4043 if (sock->listener)
4044 TRY0(xmlTextWriterWriteElement(writer,
4045 ISC_XMLCHAR "state",
4046 ISC_XMLCHAR "listener"));
4047 if (sock->connected)
4048 TRY0(xmlTextWriterWriteElement(writer,
4049 ISC_XMLCHAR "state",
4050 ISC_XMLCHAR "connected"));
4051 if (sock->pending_connect)
4052 TRY0(xmlTextWriterWriteElement(writer,
4053 ISC_XMLCHAR "state",
4054 ISC_XMLCHAR "connecting"));
4055 if (sock->bound)
4056 TRY0(xmlTextWriterWriteElement(writer,
4057 ISC_XMLCHAR "state",
4058 ISC_XMLCHAR "bound"));
4060 TRY0(xmlTextWriterEndElement(writer)); /* states */
4062 TRY0(xmlTextWriterEndElement(writer)); /* socket */
4064 UNLOCK(&sock->lock);
4065 sock = ISC_LIST_NEXT(sock, link);
4067 TRY0(xmlTextWriterEndElement(writer)); /* sockets */
4069 error:
4070 if (sock != NULL)
4071 UNLOCK(&sock->lock);
4073 UNLOCK(&mgr->lock);
4075 return (xmlrc);
4077 #endif /* HAVE_LIBXML2 */
/*
 * Replace ../socket_api.c
 */
4083 isc_result_t
4084 isc__socket_register(void) {
4085 return (ISC_R_SUCCESS);
4088 isc_result_t
4089 isc_socketmgr_createinctx(isc_mem_t *mctx, isc_appctx_t *actx,
4090 isc_socketmgr_t **managerp)
4092 isc_result_t result;
4094 result = isc_socketmgr_create(mctx, managerp);
4096 if (result == ISC_R_SUCCESS)
4097 isc_appctx_setsocketmgr(actx, *managerp);
4099 return (result);