3 Copyright (c) 2008 by Genome Research Ltd (GRL).
4 2010 by Attractive Chaos <attractor@live.co.uk>
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice shall be
15 included in all copies or substantial portions of the Software.
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 /* Probably I will not do socket programming in the next few years and
28 therefore I decide to heavily annotate this file, for Linux and
29 Windows as well. -ac */
40 #include <sys/types.h>
44 #include <arpa/inet.h>
45 #include <sys/socket.h>
46 #include <sys/select.h>
49 #include "htslib/knetfile.h"
50 #include "htslib/hts_log.h"
/* In winsock.h, the type of a socket is SOCKET, which is: "typedef
 * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
 * integer -1. In knetfile.c, I use "int" for socket type
 * throughout. This should be improved to avoid confusion.
 *
 * In Linux/Mac, recv() and read() do almost the same thing. You can see
 * in the header file that netread() is simply an alias of read(). In
 * Windows, however, they are different and using recv() is mandatory.
 */
/* This function tests if the file handler is ready for reading (or
 * writing if is_read==0).
 *
 * The wait gives up after five seconds.
 *
 * Returns the value of select(): positive when the descriptor is ready,
 * 0 on timeout and negative on error. On non-Windows builds -1 is also
 * returned, with errno set to EBADF and without calling select(), when
 * fd cannot legally be stored in an fd_set. */
static int socket_wait(int fd, int is_read)
{
    fd_set fds, *fdr = 0, *fdw = 0;
    struct timeval tv;
    int ret;

#ifndef _WIN32
    /* FD_SET() is undefined for descriptors outside [0, FD_SETSIZE).
     * Callers may hand us -1 after a failed connect, so reject it here
     * instead of corrupting the stack. */
    if (fd < 0 || fd >= FD_SETSIZE) {
        errno = EBADF;
        return -1;
    }
#endif
    tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
    FD_ZERO(&fds);
    FD_SET(fd, &fds);
    if (is_read) fdr = &fds;
    else fdw = &fds;
    ret = select(fd+1, fdr, fdw, 0, &tv);
#ifndef _WIN32
    if (ret == -1) perror("select");
#else
    if (ret == 0)
        hts_log_warning("Select timed out");
    else if (ret == SOCKET_ERROR)
        hts_log_error("Select returned error %d", WSAGetLastError());
#endif
    return ret;
}
/* This function does not work with Windows due to the lack of
 * getaddrinfo() in winsock. It is adapted from an example in "Beej's
 * Guide to Network Programming" (http://beej.us/guide/bgnet/).
 *
 * Resolves host:port and opens a connected stream socket to the first
 * address returned by getaddrinfo().
 *
 * Returns the connected descriptor, or -1 on failure. Resolution errors
 * are reported through hts_log_error(); socket-level errors through
 * perror(). No descriptor is left open on failure. */
static int socket_connect(const char *host, const char *port)
{
    int ai_err, on = 1, fd = -1;
    struct linger lng = { 0, 0 };
    struct addrinfo hints, *res = 0;
    const char *failed = 0; /* name of the call that failed, if any */

    memset(&hints, 0, sizeof(hints));
    hints.ai_family = AF_UNSPEC;
    hints.ai_socktype = SOCK_STREAM;
    /* In Unix/Mac, getaddrinfo() is the most convenient way to get
     * server information. */
    if ((ai_err = getaddrinfo(host, port, &hints, &res)) != 0) {
        hts_log_error("Can't resolve %s:%s: %s", host, port, gai_strerror(ai_err));
        return -1;
    }

    /* The two setsockopt() calls are used by ftplib
     * (http://nbpfaus.net/~pfau/ftplib/). It is not clear whether they
     * are necessary. */
    if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1)
        failed = "socket";
    else if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1)
        failed = "setsockopt";
    else if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1)
        failed = "setsockopt";
    else if (connect(fd, res->ai_addr, res->ai_addrlen) != 0)
        failed = "connect";

    if (failed) {
        perror(failed);            /* report before anything can clobber errno */
        if (fd != -1) close(fd);   /* do not leak the half-configured socket */
        fd = -1;
    }
    freeaddrinfo(res);
    return fd;
}
/* MinGW's printf has problem with "%lld" */
/* Write the decimal representation of x into buf and NUL-terminate it.
 *
 * buf must have room for at least 21 bytes (sign, 19 digits, NUL).
 * Negative values are written with a leading '-'.
 * Returns buf. */
char *int64tostr(char *buf, int64_t x)
{
    /* Work on the magnitude as unsigned so that INT64_MIN is representable. */
    uint64_t mag = x < 0 ? (uint64_t)0 - (uint64_t)x : (uint64_t)x;
    char *first = buf, *last;

    if (x < 0) *first++ = '-';
    last = first;
    do {
        *last++ = (char)('0' + (int)(mag % 10));
        mag /= 10;
    } while (mag);
    *last = '\0';
    /* Digits were produced least-significant first; reverse them in place. */
    for (--last; first < last; ++first, --last) {
        char c = *first; *first = *last; *last = c;
    }
    return buf;
}
/* Parse a run of leading decimal digits in buf as a non-negative int64_t.
 *
 * Parsing stops at the first character that is not '0'..'9', so trailing
 * text such as "\r\n" is ignored instead of being folded into the value.
 * An empty digit run yields 0. Values that do not fit are clamped to
 * INT64_MAX rather than overflowing. */
int64_t strtoint64(const char *buf)
{
    int64_t x = 0;
    for (; *buf >= '0' && *buf <= '9'; ++buf) {
        int digit = *buf - '0';
        if (x > (INT64_MAX - digit) / 10) return INT64_MAX;
        x = x * 10 + digit;
    }
    return x;
}
137 /* In windows, the first thing is to establish the TCP connection. */
138 int knet_win32_init()
141 return WSAStartup(MAKEWORD(2, 2), &wsaData
);
143 void knet_win32_destroy()
147 /* A slightly modfied version of the following function also works on
148 * Mac (and presummably Linux). However, this function is not stable on
149 * my Mac. It sometimes works fine but sometimes does not. Therefore for
150 * non-Windows OS, I do not use this one. */
151 static SOCKET
socket_connect(const char *host
, const char *port
)
153 #define __err_connect(func) \
155 hts_log_error("The %s operation returned error %d", func, WSAGetLastError()); \
161 struct linger lng
= { 0, 0 };
162 struct sockaddr_in server
;
163 struct hostent
*hp
= 0;
165 if ((fd
= socket(AF_INET
, SOCK_STREAM
, IPPROTO_TCP
)) == INVALID_SOCKET
) __err_connect("socket");
166 if (setsockopt(fd
, SOL_SOCKET
, SO_REUSEADDR
, (char*)&on
, sizeof(on
)) == -1) __err_connect("setsockopt");
167 if (setsockopt(fd
, SOL_SOCKET
, SO_LINGER
, (char*)&lng
, sizeof(lng
)) == -1) __err_connect("setsockopt");
169 if (isalpha(host
[0])) hp
= gethostbyname(host
);
172 addr
.s_addr
= inet_addr(host
);
173 hp
= gethostbyaddr((char*)&addr
, 4, AF_INET
);
175 if (hp
== 0) __err_connect("gethost");
177 server
.sin_addr
.s_addr
= *((unsigned long*)hp
->h_addr
);
178 server
.sin_family
= AF_INET
;
179 server
.sin_port
= htons(atoi(port
));
180 if (connect(fd
, (struct sockaddr
*)&server
, sizeof(server
)) != 0) __err_connect("connect");
181 // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
/* Read up to len bytes from socket fd into buf, calling netread()
 * repeatedly until the request is satisfied, the peer closes the
 * connection, an error occurs, or socket_wait() reports the socket is
 * not readable.
 *
 * Returns the number of bytes actually stored in buf, which may be less
 * than len (including 0). */
static off_t my_netread(int fd, void *buf, off_t len)
{
    off_t rest = len, curr, l = 0;
    /* recv() and read() may not read the required length of data with
     * one call. They have to be called repeatedly. */
    while (rest > 0) {
        if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
        curr = netread(fd, (void*)((char*)buf + l), rest);
        /* According to the glibc manual, section 13.2, a zero returned
         * value indicates end-of-file (EOF), which should mean that
         * read() will not return zero if EOF has not been met but data
         * are not immediately available.
         * A negative value is an error; it must not be added to the
         * counters, so stop on it as well. */
        if (curr <= 0) break;
        l += curr; rest -= curr;
    }
    return l;
}
204 /*************************
205 * FTP specific routines *
206 *************************/
208 static int kftp_get_response(knetFile
*ftp
)
217 if (socket_wait(ftp
->ctrl_fd
, 1) <= 0) return 0;
218 while (netread(ftp
->ctrl_fd
, &c
, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
220 if (n
>= ftp
->max_response
) {
221 ftp
->max_response
= ftp
->max_response
? ftp
->max_response
<<1 : 256;
222 ftp
->response
= (char*)realloc(ftp
->response
, ftp
->max_response
);
224 ftp
->response
[n
++] = c
;
226 if (n
>= 4 && isdigit(ftp
->response
[0]) && isdigit(ftp
->response
[1]) && isdigit(ftp
->response
[2])
227 && ftp
->response
[3] != '-') break;
232 if (n
< 2) return -1;
233 ftp
->response
[n
-2] = 0;
234 return strtol(ftp
->response
, &p
, 0);
237 static int kftp_send_cmd(knetFile
*ftp
, const char *cmd
, int is_get
)
239 if (socket_wait(ftp
->ctrl_fd
, 0) <= 0) return -1; // socket is not ready for writing
240 int len
= strlen(cmd
);
241 if ( netwrite(ftp
->ctrl_fd
, cmd
, len
) != len
) return -1;
242 return is_get
? kftp_get_response(ftp
) : 0;
245 static int kftp_pasv_prep(knetFile
*ftp
)
249 kftp_send_cmd(ftp
, "PASV\r\n", 1);
250 for (p
= ftp
->response
; *p
&& *p
!= '('; ++p
);
251 if (*p
!= '(') return -1;
253 sscanf(p
, "%d,%d,%d,%d,%d,%d", &v
[0], &v
[1], &v
[2], &v
[3], &v
[4], &v
[5]);
254 memcpy(ftp
->pasv_ip
, v
, 4 * sizeof(int));
255 ftp
->pasv_port
= (v
[4]<<8&0xff00) + v
[5];
260 static int kftp_pasv_connect(knetFile
*ftp
)
262 char host
[80], port
[10];
263 if (ftp
->pasv_port
== 0) {
264 hts_log_error("Must call kftp_pasv_prep() first");
267 sprintf(host
, "%d.%d.%d.%d", ftp
->pasv_ip
[0], ftp
->pasv_ip
[1], ftp
->pasv_ip
[2], ftp
->pasv_ip
[3]);
268 sprintf(port
, "%d", ftp
->pasv_port
);
269 ftp
->fd
= socket_connect(host
, port
);
270 if (ftp
->fd
== -1) return -1;
274 int kftp_connect(knetFile
*ftp
)
276 ftp
->ctrl_fd
= socket_connect(ftp
->host
, ftp
->port
);
277 if (ftp
->ctrl_fd
== -1) return -1;
278 kftp_get_response(ftp
);
279 kftp_send_cmd(ftp
, "USER anonymous\r\n", 1);
280 kftp_send_cmd(ftp
, "PASS kftp@\r\n", 1);
281 kftp_send_cmd(ftp
, "TYPE I\r\n", 1);
285 int kftp_reconnect(knetFile
*ftp
)
287 if (ftp
->ctrl_fd
!= -1) {
288 netclose(ftp
->ctrl_fd
);
293 return kftp_connect(ftp
);
296 // initialize ->type, ->host, ->retr and ->size
297 knetFile
*kftp_parse_url(const char *fn
, const char *mode
)
302 if (strstr(fn
, "ftp://") != fn
) return 0;
303 for (p
= (char*)fn
+ 6; *p
&& *p
!= '/'; ++p
);
304 if (*p
!= '/') return 0;
306 fp
= (knetFile
*)calloc(1, sizeof(knetFile
));
307 fp
->type
= KNF_TYPE_FTP
;
309 /* the Linux/Mac version of socket_connect() also recognizes a port
310 * like "ftp", but the Windows version does not. */
311 fp
->port
= strdup("21");
312 fp
->host
= (char*)calloc(l
+ 1, 1);
313 if (strchr(mode
, 'c')) fp
->no_reconnect
= 1;
314 strncpy(fp
->host
, fn
+ 6, l
);
315 fp
->retr
= (char*)calloc(strlen(p
) + 8, 1);
316 sprintf(fp
->retr
, "RETR %s\r\n", p
);
317 fp
->size_cmd
= (char*)calloc(strlen(p
) + 8, 1);
318 sprintf(fp
->size_cmd
, "SIZE %s\r\n", p
);
322 // place ->fd at offset off
323 int kftp_connect_file(knetFile
*fp
)
329 if (fp
->no_reconnect
) kftp_get_response(fp
);
332 kftp_send_cmd(fp
, fp
->size_cmd
, 1);
334 // If the file does not exist, the response will be "550 Could not get file
335 // size". Be silent on failure, hts_idx_load can be trying the existence of .csi or .tbi.
336 if ( sscanf(fp
->response
,"%*d %lld", &file_size
) != 1 ) return -1;
338 const char *p
= fp
->response
;
339 while (*p
!= ' ') ++p
;
340 while (*p
< '0' || *p
> '9') ++p
;
341 file_size
= strtoint64(p
);
343 fp
->file_size
= file_size
;
347 sprintf(tmp
, "REST %lld\r\n", (long long)fp
->offset
);
349 strcpy(tmp
, "REST ");
350 int64tostr(tmp
+ 5, fp
->offset
);
353 kftp_send_cmd(fp
, tmp
, 1);
355 kftp_send_cmd(fp
, fp
->retr
, 0);
356 kftp_pasv_connect(fp
);
357 ret
= kftp_get_response(fp
);
359 hts_log_error("%s", fp
->response
);
369 /**************************
370 * HTTP specific routines *
371 **************************/
373 knetFile
*khttp_parse_url(const char *fn
, const char *mode
)
378 if (strstr(fn
, "http://") != fn
) return 0;
380 for (p
= (char*)fn
+ 7; *p
&& *p
!= '/'; ++p
);
382 fp
= (knetFile
*)calloc(1, sizeof(knetFile
));
383 fp
->http_host
= (char*)calloc(l
+ 1, 1);
384 strncpy(fp
->http_host
, fn
+ 7, l
);
385 fp
->http_host
[l
] = 0;
386 for (q
= fp
->http_host
; *q
&& *q
!= ':'; ++q
);
387 if (*q
== ':') *q
++ = 0;
389 proxy
= getenv("http_proxy");
390 // set ->host, ->port and ->path
392 fp
->host
= strdup(fp
->http_host
); // when there is no proxy, server name is identical to http_host name.
393 fp
->port
= strdup(*q
? q
: "80");
394 fp
->path
= strdup(*p
? p
: "/");
396 fp
->host
= (strstr(proxy
, "http://") == proxy
)? strdup(proxy
+ 7) : strdup(proxy
);
397 for (q
= fp
->host
; *q
&& *q
!= ':'; ++q
);
398 if (*q
== ':') *q
++ = 0;
399 fp
->port
= strdup(*q
? q
: "80");
400 fp
->path
= strdup(fn
);
402 fp
->type
= KNF_TYPE_HTTP
;
403 fp
->ctrl_fd
= fp
->fd
= -1;
408 int khttp_connect_file(knetFile
*fp
)
412 if (fp
->fd
!= -1) netclose(fp
->fd
);
413 fp
->fd
= socket_connect(fp
->host
, fp
->port
);
414 buf
= (char*)calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
415 l
+= sprintf(buf
+ l
, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp
->path
, fp
->http_host
);
416 if (fp
->offset
!= 0) l
+= sprintf(buf
+ l
, "Range: bytes=%lld-\r\n", (long long)fp
->offset
);
417 l
+= sprintf(buf
+ l
, "\r\n");
418 if ( netwrite(fp
->fd
, buf
, l
) != l
) { free(buf
); return -1; }
420 while (netread(fp
->fd
, buf
+ l
, 1)) { // read HTTP header; FIXME: bad efficiency
421 if (buf
[l
] == '\n' && l
>= 3)
422 if (strncmp(buf
+ l
- 3, "\r\n\r\n", 4) == 0) break;
426 if (l
< 14) { // prematured header
432 ret
= strtol(buf
+ 8, &p
, 0); // HTTP return code
433 if (ret
== 200 && fp
->offset
>0) { // 200 (complete result); then skip beginning of the file
434 off_t rest
= fp
->offset
;
436 off_t l
= rest
< 0x10000? rest
: 0x10000;
437 rest
-= my_netread(fp
->fd
, buf
, l
);
439 } else if (ret
!= 206 && ret
!= 200) {
440 // failed to open file
444 case 401: errno
= EPERM
; break;
445 case 403: errno
= EACCES
; break;
446 case 404: errno
= ENOENT
; break;
447 case 407: errno
= EPERM
; break;
448 case 408: errno
= ETIMEDOUT
; break;
449 case 410: errno
= ENOENT
; break;
450 case 503: errno
= EAGAIN
; break;
451 case 504: errno
= ETIMEDOUT
; break;
452 default: errno
= (ret
>= 400 && ret
< 500)? EINVAL
: EIO
; break;
462 /********************
464 ********************/
466 knetFile
*knet_open(const char *fn
, const char *mode
)
469 if (mode
[0] != 'r') {
470 hts_log_error("Only mode \"r\" is supported");
474 if (strstr(fn
, "ftp://") == fn
) {
475 fp
= kftp_parse_url(fn
, mode
);
476 if (fp
== 0) return 0;
477 if (kftp_connect(fp
) == -1) {
481 kftp_connect_file(fp
);
482 } else if (strstr(fn
, "http://") == fn
) {
483 fp
= khttp_parse_url(fn
, mode
);
484 if (fp
== 0) return 0;
485 khttp_connect_file(fp
);
486 } else { // local file
488 /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
489 * be undefined on some systems, although it is defined on my
490 * Mac and the Linux I have tested on. */
491 int fd
= open(fn
, O_RDONLY
| O_BINARY
);
493 int fd
= open(fn
, O_RDONLY
);
499 fp
= (knetFile
*)calloc(1, sizeof(knetFile
));
500 fp
->type
= KNF_TYPE_LOCAL
;
504 if (fp
&& fp
->fd
== -1) {
511 knetFile
*knet_dopen(int fd
, const char *mode
)
513 knetFile
*fp
= (knetFile
*)calloc(1, sizeof(knetFile
));
514 fp
->type
= KNF_TYPE_LOCAL
;
519 ssize_t
knet_read(knetFile
*fp
, void *buf
, size_t len
)
522 if (fp
->fd
== -1) return 0;
523 if (fp
->type
== KNF_TYPE_FTP
) {
524 if (fp
->is_ready
== 0) {
525 if (!fp
->no_reconnect
) kftp_reconnect(fp
);
526 kftp_connect_file(fp
);
528 } else if (fp
->type
== KNF_TYPE_HTTP
) {
529 if (fp
->is_ready
== 0)
530 khttp_connect_file(fp
);
532 if (fp
->type
== KNF_TYPE_LOCAL
) { // on Windows, the following block is necessary; not on UNIX
537 curr
= read(fp
->fd
, (void*)((char*)buf
+ l
), rest
);
538 } while (curr
< 0 && EINTR
== errno
);
539 if (curr
< 0) return -1;
540 if (curr
== 0) break;
541 l
+= curr
; rest
-= curr
;
543 } else l
= my_netread(fp
->fd
, buf
, len
);
548 off_t
knet_seek(knetFile
*fp
, off_t off
, int whence
)
550 if (whence
== SEEK_SET
&& off
== fp
->offset
) return 0;
551 if (fp
->type
== KNF_TYPE_LOCAL
) {
552 /* Be aware that lseek() returns the offset after seeking, while fseek() returns zero on success. */
553 off_t offset
= lseek(fp
->fd
, off
, whence
);
554 if (offset
== -1) return -1;
557 } else if (fp
->type
== KNF_TYPE_FTP
) {
558 if (whence
== SEEK_CUR
) fp
->offset
+= off
;
559 else if (whence
== SEEK_SET
) fp
->offset
= off
;
560 else if (whence
== SEEK_END
) fp
->offset
= fp
->file_size
+ off
;
564 } else if (fp
->type
== KNF_TYPE_HTTP
) {
565 if (whence
== SEEK_END
) { // FIXME: can we allow SEEK_END in future?
566 hts_log_error("SEEK_END is not supported for HTTP. Offset is unchanged");
570 if (whence
== SEEK_CUR
) fp
->offset
+= off
;
571 else if (whence
== SEEK_SET
) fp
->offset
= off
;
577 hts_log_error("%s", strerror(errno
));
581 int knet_close(knetFile
*fp
)
583 if (fp
== 0) return 0;
584 if (fp
->ctrl_fd
!= -1) netclose(fp
->ctrl_fd
); // FTP specific
586 /* On Linux/Mac, netclose() is an alias of close(), but on
587 * Windows, it is an alias of closesocket(). */
588 if (fp
->type
== KNF_TYPE_LOCAL
) close(fp
->fd
);
589 else netclose(fp
->fd
);
591 free(fp
->host
); free(fp
->port
);
592 free(fp
->response
); free(fp
->retr
); // FTP specific
593 free(fp
->path
); free(fp
->http_host
); // HTTP specific
607 buf
= calloc(0x100000, 1);
609 fp
= knet_open("knetfile.c", "r");
610 knet_seek(fp
, 1000, SEEK_SET
);
611 } else if (type
== 1) { // NCBI FTP, large file
612 fp
= knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
613 knet_seek(fp
, 2500000000ll, SEEK_SET
);
614 l
= knet_read(fp
, buf
, 255);
615 } else if (type
== 2) {
616 fp
= knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
617 knet_seek(fp
, 1000, SEEK_SET
);
618 } else if (type
== 3) {
619 fp
= knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
620 knet_seek(fp
, 1000, SEEK_SET
);
621 } else if (type
== 4) {
622 fp
= knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
623 knet_read(fp
, buf
, 10000);
624 knet_seek(fp
, 20000, SEEK_SET
);
625 knet_seek(fp
, 10000, SEEK_SET
);
626 l
= knet_read(fp
, buf
+10000, 10000000) + 10000;
628 if (type
!= 4 && type
!= 1) {
629 knet_read(fp
, buf
, 255);
632 } else write(fileno(stdout
), buf
, l
);