2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * RAW - implementation of IP "raw" sockets.
8 * Version: $Id: raw.c,v 1.43 1999/08/20 11:05:57 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 * Alan Cox : verify_area() fixed up
15 * Alan Cox : ICMP error handling
16 * Alan Cox : EMSGSIZE if you send too big a packet
17 * Alan Cox : Now uses generic datagrams and shared skbuff
18 * library. No more peek crashes, no more backlogs
19 * Alan Cox : Checks sk->broadcast.
20 * Alan Cox : Uses skb_free_datagram/skb_copy_datagram
21 * Alan Cox : Raw passes ip options too
22 * Alan Cox : Setsocketopt added
23 * Alan Cox : Fixed error return for broadcasts
24 * Alan Cox : Removed wake_up calls
25 * Alan Cox : Use ttl/tos
26 * Alan Cox : Cleaned up old debugging
27 * Alan Cox : Use new kernel side addresses
28 * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets.
29 * Alan Cox : BSD style RAW socket demultiplexing.
30 * Alan Cox : Beginnings of mrouted support.
31 * Alan Cox : Added IP_HDRINCL option.
32 * Alan Cox : Skip broadcast check if BSDism set.
33 * David S. Miller : New socket lookup architecture.
35 * This program is free software; you can redistribute it and/or
36 * modify it under the terms of the GNU General Public License
37 * as published by the Free Software Foundation; either version
38 * 2 of the License, or (at your option) any later version.
41 #include <linux/config.h>
42 #include <asm/system.h>
43 #include <asm/uaccess.h>
44 #include <linux/types.h>
45 #include <linux/sched.h>
46 #include <linux/errno.h>
47 #include <linux/timer.h>
49 #include <linux/kernel.h>
50 #include <linux/fcntl.h>
51 #include <linux/socket.h>
53 #include <linux/inet.h>
54 #include <linux/netdevice.h>
55 #include <linux/mroute.h>
57 #include <net/protocol.h>
58 #include <linux/skbuff.h>
63 #include <net/inet_common.h>
64 #include <net/checksum.h>
/*
 * Hash table of raw IPv4 sockets: RAWV4_HTABLE_SIZE chain heads of
 * struct sock, linked through ->next.  raw_v4_lock is the read/write
 * lock (initialised unlocked) that the functions in this file take
 * around every visible access to the table.
 */
66 struct sock
*raw_v4_htable
[RAWV4_HTABLE_SIZE
];
67 rwlock_t raw_v4_lock
= RW_LOCK_UNLOCKED
;
/*
 * raw_v4_hash - "hash" callback for raw sockets (listed in raw_prot).
 *
 * Selects the chain raw_v4_htable[sk->num & (RAWV4_HTABLE_SIZE - 1)]
 * and, between write_lock_bh()/write_unlock_bh() on raw_v4_lock, points
 * sk->next at the current chain head, redirects that head's pprev to
 * &sk->next, and raises sk->prot->highestinuse to sk->prot->inuse when
 * the latter is larger.
 *
 * NOTE(review): the embedded line numbers jump from 75 to 79 (and skip
 * 81), so some statements are not visible in this listing.  Presumably
 * that is where sk is stored as the new chain head and inuse is
 * incremented - confirm against the complete file.
 */
69 static void raw_v4_hash(struct sock
*sk
)
71 struct sock
**skp
= &raw_v4_htable
[sk
->num
& (RAWV4_HTABLE_SIZE
- 1)];
73 write_lock_bh(&raw_v4_lock
);
74 if ((sk
->next
= *skp
) != NULL
)
75 (*skp
)->pprev
= &sk
->next
;
79 if(sk
->prot
->highestinuse
< sk
->prot
->inuse
)
80 sk
->prot
->highestinuse
= sk
->prot
->inuse
;
82 write_unlock_bh(&raw_v4_lock
);
/*
 * raw_v4_unhash - "unhash" callback for raw sockets (listed in raw_prot).
 *
 * Between write_lock_bh()/write_unlock_bh() on raw_v4_lock, splices sk
 * out of its chain: the successor's pprev takes over sk->pprev, and the
 * slot that sk->pprev points at is redirected to sk->next.
 *
 * NOTE(review): embedded lines 88-89 and 92-95 are absent from this
 * listing.  As shown, sk->next is dereferenced without a visible NULL
 * check and sk->pprev without a visible "is hashed" check - confirm
 * that the missing lines provide those guards.
 */
85 static void raw_v4_unhash(struct sock
*sk
)
87 write_lock_bh(&raw_v4_lock
);
90 sk
->next
->pprev
= sk
->pprev
;
91 *sk
->pprev
= sk
->next
;
96 write_unlock_bh(&raw_v4_lock
);
/*
 * __raw_v4_lookup - scan a raw-socket chain for a socket matching a packet.
 *
 * Starting at sk and following ->next, each socket s is tested for:
 *   - s->num equal to num;
 *   - s->daddr either zero or equal to raddr;
 *   - s->rcv_saddr either zero or equal to laddr;
 *   - s->bound_dev_if either zero or equal to dif.
 * No locking is visible in this function; raw_v4_input() calls it after
 * taking raw_v4_lock for reading.
 *
 * NOTE(review): the end of the parameter list (dif is used but its
 * declaration is not shown), the declaration of s, and everything after
 * the match test fall on embedded lines that are missing here (101-104
 * and 110 onwards).  Presumably the first matching socket, or NULL, is
 * returned - confirm.
 */
99 struct sock
*__raw_v4_lookup(struct sock
*sk
, unsigned short num
,
100 unsigned long raddr
, unsigned long laddr
,
105 for(s
= sk
; s
; s
= s
->next
) {
106 if((s
->num
== num
) &&
107 !(s
->daddr
&& s
->daddr
!= raddr
) &&
108 !(s
->rcv_saddr
&& s
->rcv_saddr
!= laddr
) &&
109 !(s
->bound_dev_if
&& s
->bound_dev_if
!= dif
))
/*
 * icmp_filter - consult a raw socket's ICMP type filter for a packet.
 *
 * Reads the ICMP type from skb->h.icmph and returns test_bit() of that
 * type in sk->tp_pinfo.tp_raw4.filter.  raw_v4_input() only clones the
 * packet for the socket when this returns zero, so a set bit means the
 * type is filtered out for that socket.
 *
 * NOTE(review): embedded lines 120-122, 124, 126 and 128 onwards are
 * missing from this listing.  The trailing comment suggests that types
 * outside the bitmap are let through by a path that is not visible
 * here - confirm that the type is range-checked before test_bit().
 */
119 static __inline__
int icmp_filter(struct sock
*sk
, struct sk_buff
*skb
)
123 type
= skb
->h
.icmph
->type
;
125 return test_bit(type
, &sk
->tp_pinfo
.tp_raw4
.filter
);
127 /* Do not block unknown ICMP types */
131 /* IP input processing comes here for RAW socket delivery.
132 * This is fun as to avoid copies we want to make no surplus
135 * RFC 1122: SHOULD pass TOS value up to the transport layer.
136 * -> It does. And not only TOS, but all IP header.
/*
 * raw_v4_input - offer an incoming IP packet to the matching raw sockets.
 *
 * After read_lock(&raw_v4_lock), takes the head of raw_v4_htable[hash]
 * and uses __raw_v4_lookup() to find the first socket matching
 * iph->protocol, iph->saddr and iph->daddr.  For a matched socket it
 * looks ahead for the next match (sknext) starting from sk->next, and
 * makes a GFP_ATOMIC skb_clone() of the packet unless the protocol is
 * IPPROTO_ICMP and icmp_filter() returns non-zero for that socket.
 * The lock is dropped with read_unlock() near the end.
 *
 * NOTE(review): many embedded lines are missing from this listing
 * (139-141, 144, 147-149, 152, 156-158, 161-168, 170 onwards), among
 * them the loop structure, the final lookup argument, what is done with
 * the clone, and the return value.  Confirm against the complete file
 * before relying on this summary.
 */
138 struct sock
*raw_v4_input(struct sk_buff
*skb
, struct iphdr
*iph
, int hash
)
142 read_lock(&raw_v4_lock
);
143 if ((sk
= raw_v4_htable
[hash
]) == NULL
)
145 sk
= __raw_v4_lookup(sk
, iph
->protocol
,
146 iph
->saddr
, iph
->daddr
,
150 struct sock
*sknext
= __raw_v4_lookup(sk
->next
, iph
->protocol
,
151 iph
->saddr
, iph
->daddr
,
153 if (iph
->protocol
!= IPPROTO_ICMP
||
154 ! icmp_filter(sk
, skb
)) {
155 struct sk_buff
*clone
;
159 clone
= skb_clone(skb
, GFP_ATOMIC
);
160 /* Not releasing hash table! */
169 read_unlock(&raw_v4_lock
);
/*
 * raw_err - ICMP error handler for raw sockets.
 *
 * type and code are read from the ICMP header at skb->h.icmph.  The
 * first visible test concerns sockets that neither have recverr set nor
 * are in TCP_ESTABLISHED state (see the in-line comment on when errors
 * are reported).  The visible cases are:
 *   - ICMP_TIME_EXCEEDED and ICMP_SOURCE_QUENCH;
 *   - ICMP_PARAMETERPROB: info is the top byte of the un.gateway word;
 *   - ICMP_DEST_UNREACH: err and harderr come from
 *     icmp_err_convert[code]; for ICMP_FRAG_NEEDED harderr is instead
 *     "pmtudisc != IP_PMTUDISC_DONT" and info is the reported MTU.
 * With recverr set, ip_icmp_error() is called with the data following
 * the ICMP header; sk->error_report() is called when recverr or harderr
 * is set.
 *
 * NOTE(review): `code > NR_ICMP_UNREACH` is the only visible bound on
 * the icmp_err_convert[] index; whether code == NR_ICMP_UNREACH is a
 * valid index depends on the table's size, which is not visible here.
 * The switch head, the per-case err assignments, breaks and early
 * returns fall on embedded lines that are missing from this listing.
 */
174 void raw_err (struct sock
*sk
, struct sk_buff
*skb
)
176 int type
= skb
->h
.icmph
->type
;
177 int code
= skb
->h
.icmph
->code
;
182 /* Report error on raw socket, if:
183 1. User requested ip_recverr.
184 2. Socket is connected (otherwise the error indication
185 is useless without ip_recverr and error is hard.
187 if (!sk
->protinfo
.af_inet
.recverr
&& sk
->state
!= TCP_ESTABLISHED
)
192 case ICMP_TIME_EXCEEDED
:
195 case ICMP_SOURCE_QUENCH
:
197 case ICMP_PARAMETERPROB
:
199 info
= ntohl(skb
->h
.icmph
->un
.gateway
)>>24;
202 case ICMP_DEST_UNREACH
:
204 if (code
> NR_ICMP_UNREACH
)
206 err
= icmp_err_convert
[code
].errno
;
207 harderr
= icmp_err_convert
[code
].fatal
;
208 if (code
== ICMP_FRAG_NEEDED
) {
209 harderr
= (sk
->protinfo
.af_inet
.pmtudisc
!= IP_PMTUDISC_DONT
);
211 info
= ntohs(skb
->h
.icmph
->un
.frag
.mtu
);
215 if (sk
->protinfo
.af_inet
.recverr
)
216 ip_icmp_error(sk
, skb
, err
, 0, info
, (u8
*)(skb
->h
.icmph
+ 1));
218 if (sk
->protinfo
.af_inet
.recverr
|| harderr
) {
220 sk
->error_report(sk
);
/*
 * raw_rcv_skb - queue a packet on a raw socket's receive queue.
 *
 * Calls sock_queue_rcv_skb(sk, skb).  A negative result is counted in
 * ip_statistics.IpInDiscards; ip_statistics.IpInDelivers is incremented
 * on the other visible path.  Also listed in raw_prot as backlog_rcv.
 *
 * NOTE(review): embedded lines 225, 227, 229, 231-234 and 236 onwards
 * are missing from this listing, so what happens to skb on failure and
 * the return values are not visible.
 */
224 static int raw_rcv_skb(struct sock
* sk
, struct sk_buff
* skb
)
226 /* Charge it to the socket. */
228 if (sock_queue_rcv_skb(sk
,skb
)<0)
230 ip_statistics
.IpInDiscards
++;
235 ip_statistics
.IpInDelivers
++;
240 * This should be the easiest of all, all we do is
241 * copy it into a buffer. All demultiplexing is done
/*
 * raw_rcv - receive entry point for a packet addressed to a raw socket.
 *
 * Trims skb to the length given by the IP header's tot_len, points the
 * transport-header pointer skb->h.raw at the network header
 * (skb->nh.raw), and passes the packet to raw_rcv_skb().  The result of
 * raw_rcv_skb() is not used in the visible lines.
 *
 * NOTE(review): the function's return statement falls on an embedded
 * line that is missing from this listing.
 */
245 int raw_rcv(struct sock
*sk
, struct sk_buff
*skb
)
247 /* Now we need to copy this into memory. */
248 skb_trim(skb
, ntohs(skb
->nh
.iph
->tot_len
));
250 skb
->h
.raw
= skb
->nh
.raw
;
252 raw_rcv_skb(sk
, skb
);
263 * Send a RAW IP packet.
267 * Callback support is trivial for SOCK_RAW
/*
 * raw_getfrag - fragment-fill callback that raw_sendmsg() passes to
 * ip_build_xmit() when the socket's hdrincl flag is clear.
 *
 * p is the struct rawfakehdr supplied by raw_sendmsg().  Hands
 * (to, rfh->iov, offset, fraglen) to memcpy_fromiovecend() and returns
 * its result unchanged.
 */
270 static int raw_getfrag(const void *p
, char *to
, unsigned int offset
, unsigned int fraglen
)
272 struct rawfakehdr
*rfh
= (struct rawfakehdr
*) p
;
273 return memcpy_fromiovecend(to
, rfh
->iov
, offset
, fraglen
);
277 * IPPROTO_RAW needs extra work.
/*
 * raw_getrawfrag - fragment-fill callback that raw_sendmsg() passes to
 * ip_build_xmit() when the socket's hdrincl flag is set.
 *
 * Copies the data with memcpy_fromiovecend() like raw_getfrag(), then
 * treats `to` as a struct iphdr and fills in saddr (from rfh->saddr),
 * tot_len (fraglen), id (from the global ip_id_count, post-incremented)
 * and the header checksum via ip_fast_csum().
 *
 * NOTE(review): the conditions guarding these assignments fall on
 * embedded lines that are missing from this listing (285-287, 289, 291,
 * 293-294, 297-298, 301 onwards).  Presumably some fields are only
 * filled in when the caller left them zero, and only for the first
 * fragment - confirm.  The check field must also be zero before
 * ip_fast_csum() for the result to be a valid checksum; no such reset
 * is visible here.
 */
280 static int raw_getrawfrag(const void *p
, char *to
, unsigned int offset
, unsigned int fraglen
)
282 struct rawfakehdr
*rfh
= (struct rawfakehdr
*) p
;
284 if (memcpy_fromiovecend(to
, rfh
->iov
, offset
, fraglen
))
288 struct iphdr
*iph
= (struct iphdr
*)to
;
290 iph
->saddr
= rfh
->saddr
;
292 iph
->tot_len
=htons(fraglen
); /* This is right as you can't frag
295 * Deliberate breach of modularity to keep
296 * ip_build_xmit clean (well less messy).
299 iph
->id
= htons(ip_id_count
++);
300 iph
->check
=ip_fast_csum((unsigned char *)iph
, iph
->ihl
);
/*
 * raw_sendmsg - sendmsg handler for raw sockets (listed in raw_prot).
 *
 * Outline of the visible steps:
 *   - len is checked against the range 0..0xFFFF (an overflow guard
 *     only, per the in-line comment) and msg_flags is tested for MSG_OOB;
 *   - when msg_namelen is non-zero the destination comes from msg_name,
 *     which must be at least sizeof(struct sockaddr_in); a family other
 *     than AF_INET triggers a printk naming the current task.  Without
 *     a name, the socket state is tested against TCP_ESTABLISHED;
 *   - ipc is seeded from sk->saddr and sk->bound_dev_if, then
 *     ip_cmsg_send() processes control data if msg_controllen is set;
 *   - rfh.saddr is taken from ipc.addr, the socket's stored IP options
 *     are picked up, hdrincl is tested in connection with options, and
 *     daddr may be replaced by the options' faddr;
 *   - tos is RT_TOS(socket tos) or-ed with sk->localroute, and
 *     MSG_DONTROUTE is tested;
 *   - for a multicast daddr, mc_index and mc_addr are used for
 *     ipc.oif and rfh.saddr;
 *   - the route is looked up with ip_route_output(); a broadcast route
 *     is tested against sk->broadcast;
 *   - the data goes out through ip_build_xmit() with raw_getrawfrag or
 *     raw_getfrag chosen by hdrincl, and the visible return yields err
 *     if negative, otherwise len;
 *   - the MSG_CONFIRM path calls dst_confirm() on the route and jumps
 *     back to back_from_confirm unless MSG_PROBE is set and len is zero.
 *
 * NOTE(review): a large share of the embedded line numbers are missing
 * from this listing, including most local declarations, every error
 * return, the else-branches and the cleanup labels.  The conditions
 * above are therefore stated as tests only; their outcomes need to be
 * confirmed against the complete file.
 */
305 static int raw_sendmsg(struct sock
*sk
, struct msghdr
*msg
, int len
)
307 struct ipcm_cookie ipc
;
308 struct rawfakehdr rfh
;
309 struct rtable
*rt
= NULL
;
315 /* This check is ONLY to check for arithmetic overflow
316 on integer(!) len. Not more! Real check will be made
317 in ip_build_xmit --ANK
319 BTW socket.c -> af_*.c -> ... make multiple
320 invalid conversions size_t -> int. We MUST repair it f.e.
321 by replacing all of them with size_t and revise all
322 the places sort of len += sizeof(struct iphdr)
323 If len was ULONG_MAX-10 it would be cathastrophe --ANK
326 if (len
< 0 || len
> 0xFFFF)
333 if (msg
->msg_flags
& MSG_OOB
) /* Mirror BSD error message compatibility */
337 * Get and verify the address.
340 if (msg
->msg_namelen
) {
341 struct sockaddr_in
*usin
= (struct sockaddr_in
*)msg
->msg_name
;
342 if (msg
->msg_namelen
< sizeof(*usin
))
344 if (usin
->sin_family
!= AF_INET
) {
345 static int complained
;
347 printk(KERN_INFO
"%s forgot to set AF_INET in raw sendmsg. Fix it!\n", current
->comm
);
348 if (usin
->sin_family
)
351 daddr
= usin
->sin_addr
.s_addr
;
352 /* ANK: I did not forget to get protocol from port field.
353 * I just do not know, who uses this weirdness.
354 * IP_HDRINCL is much more convenient.
357 if (sk
->state
!= TCP_ESTABLISHED
)
362 ipc
.addr
= sk
->saddr
;
364 ipc
.oif
= sk
->bound_dev_if
;
366 if (msg
->msg_controllen
) {
367 int tmp
= ip_cmsg_send(msg
, &ipc
);
374 rfh
.saddr
= ipc
.addr
;
378 ipc
.opt
= sk
->protinfo
.af_inet
.opt
;
382 /* Linux does not mangle headers on raw sockets,
383 * so that IP options + IP_HDRINCL is non-sense.
385 if (sk
->protinfo
.af_inet
.hdrincl
)
390 daddr
= ipc
.opt
->faddr
;
393 tos
= RT_TOS(sk
->protinfo
.af_inet
.tos
) | sk
->localroute
;
394 if (msg
->msg_flags
&MSG_DONTROUTE
)
397 if (MULTICAST(daddr
)) {
399 ipc
.oif
= sk
->protinfo
.af_inet
.mc_index
;
401 rfh
.saddr
= sk
->protinfo
.af_inet
.mc_addr
;
404 err
= ip_route_output(&rt
, daddr
, rfh
.saddr
, tos
, ipc
.oif
);
410 if (rt
->rt_flags
&RTCF_BROADCAST
&& !sk
->broadcast
)
413 if (msg
->msg_flags
&MSG_CONFIRM
)
417 rfh
.iov
= msg
->msg_iov
;
418 rfh
.saddr
= rt
->rt_src
;
420 ipc
.addr
= rt
->rt_dst
;
421 err
=ip_build_xmit(sk
, sk
->protinfo
.af_inet
.hdrincl
? raw_getrawfrag
: raw_getfrag
,
422 &rfh
, len
, &ipc
, rt
, msg
->msg_flags
);
429 return err
<0 ? err
: len
;
432 dst_confirm(&rt
->u
.dst
);
433 if (!(msg
->msg_flags
&MSG_PROBE
) || len
)
434 goto back_from_confirm
;
/*
 * raw_close - close handler for raw sockets (listed in raw_prot).
 *
 * Calls ip_ra_control(sk, 0, NULL) to drop kernel-side references to
 * the socket (see the in-line comment), then inet_sock_release(sk).
 * The timeout argument is not used in the visible lines.
 */
439 static void raw_close(struct sock
*sk
, long timeout
)
442 * Raw sockets may have direct kernel refereneces. Kill them.
444 ip_ra_control(sk
, 0, NULL
);
446 inet_sock_release(sk
);
449 /* This gets rid of all the nasties in af_inet. -DaveM */
/*
 * raw_bind - bind a raw socket to a local IPv4 address.
 *
 * The first test covers a socket that is not in TCP_CLOSE state or an
 * addr_len shorter than struct sockaddr_in.  The address is classified
 * with inet_addr_type(); a non-zero address that is not RTN_LOCAL,
 * RTN_MULTICAST or RTN_BROADCAST yields -EADDRNOTAVAIL.  Otherwise the
 * address is stored in both sk->rcv_saddr and sk->saddr, and for
 * multicast or broadcast addresses sk->saddr is reset to 0 so that the
 * device's address is used as source.
 *
 * NOTE(review): addr_len is a signed int compared against sizeof(), so
 * the comparison is carried out unsigned and a negative addr_len would
 * pass it; presumably the socket layer validates the length before this
 * is called - confirm.  The result of the first test and the final
 * return fall on embedded lines that are missing from this listing.
 */
450 static int raw_bind(struct sock
*sk
, struct sockaddr
*uaddr
, int addr_len
)
452 struct sockaddr_in
*addr
= (struct sockaddr_in
*) uaddr
;
455 if((sk
->state
!= TCP_CLOSE
) || (addr_len
< sizeof(struct sockaddr_in
)))
457 chk_addr_ret
= inet_addr_type(addr
->sin_addr
.s_addr
);
458 if(addr
->sin_addr
.s_addr
!= 0 && chk_addr_ret
!= RTN_LOCAL
&&
459 chk_addr_ret
!= RTN_MULTICAST
&& chk_addr_ret
!= RTN_BROADCAST
)
460 return -EADDRNOTAVAIL
;
461 sk
->rcv_saddr
= sk
->saddr
= addr
->sin_addr
.s_addr
;
462 if(chk_addr_ret
== RTN_MULTICAST
|| chk_addr_ret
== RTN_BROADCAST
)
463 sk
->saddr
= 0; /* Use device */
469 * This should be easy, if there is something there
470 * we return it, otherwise we block.
/*
 * raw_recvmsg - recvmsg handler for raw sockets (listed in raw_prot).
 *
 * *addr_len is set to sizeof(struct sockaddr_in).  A MSG_ERRQUEUE
 * request is served by ip_recv_error().  Otherwise one datagram is
 * taken with skb_recv_datagram(), `copied` bytes of it are copied into
 * msg->msg_iov with skb_copy_datagram_iovec(), sk->stamp is updated
 * from the skb, the sender's address is written to msg_name as an
 * AF_INET sockaddr_in, and ip_cmsg_recv() runs when any cmsg_flags are
 * set.  The skb is then freed and the function returns err if it is
 * non-zero, otherwise copied.
 *
 * NOTE(review): MSG_TRUNC is set on a path whose condition is not
 * visible (presumably when the datagram is longer than len).  The NULL
 * guards around addr_len and sin, if any, also fall on embedded lines
 * that are missing from this listing - confirm they exist, since
 * msg_name may be absent.
 */
473 int raw_recvmsg(struct sock
*sk
, struct msghdr
*msg
, int len
,
474 int noblock
, int flags
,int *addr_len
)
479 struct sockaddr_in
*sin
=(struct sockaddr_in
*)msg
->msg_name
;
485 *addr_len
=sizeof(*sin
);
487 if (flags
& MSG_ERRQUEUE
)
488 return ip_recv_error(sk
, msg
, len
);
490 skb
=skb_recv_datagram(sk
,flags
,noblock
,&err
);
497 msg
->msg_flags
|= MSG_TRUNC
;
501 err
= skb_copy_datagram_iovec(skb
, 0, msg
->msg_iov
, copied
);
505 sk
->stamp
=skb
->stamp
;
507 /* Copy the address. */
509 sin
->sin_family
= AF_INET
;
510 sin
->sin_addr
.s_addr
= skb
->nh
.iph
->saddr
;
512 if (sk
->protinfo
.af_inet
.cmsg_flags
)
513 ip_cmsg_recv(msg
, skb
);
515 skb_free_datagram(sk
, skb
);
516 return (err
? : copied
);
/*
 * raw_init - per-socket initialisation for raw sockets.
 *
 * When the socket's protocol number (sk->num) is IPPROTO_ICMP, zeroes
 * the ICMP filter held in sk->tp_pinfo.tp_raw4, i.e. starts with no
 * ICMP types filtered out.
 *
 * NOTE(review): the return statement falls on an embedded line that is
 * missing from this listing.
 */
519 static int raw_init(struct sock
*sk
)
521 struct raw_opt
*tp
= &(sk
->tp_pinfo
.tp_raw4
);
522 if (sk
->num
== IPPROTO_ICMP
)
523 memset(&tp
->filter
, 0, sizeof(tp
->filter
));
/*
 * raw_seticmpfilter - install a user-supplied ICMP filter on the socket.
 *
 * optlen is clamped to sizeof(struct icmp_filter) and that many bytes
 * are copied from user space into sk->tp_pinfo.tp_raw4.filter.  Because
 * the comparison against sizeof() is done unsigned, a negative optlen
 * is clamped as well.  A shorter optlen overwrites only the leading
 * part of the filter.
 *
 * NOTE(review): the results for a failed and a successful
 * copy_from_user() fall on embedded lines that are missing from this
 * listing.
 */
527 static int raw_seticmpfilter(struct sock
*sk
, char *optval
, int optlen
)
529 if (optlen
> sizeof(struct icmp_filter
))
530 optlen
= sizeof(struct icmp_filter
);
531 if (copy_from_user(&sk
->tp_pinfo
.tp_raw4
.filter
, optval
, optlen
))
/*
 * raw_geticmpfilter - copy the socket's ICMP filter out to user space.
 *
 * Reads the caller's buffer length from *optlen with get_user(), clamps
 * it to sizeof(struct icmp_filter), writes the clamped length back with
 * put_user(), and copies that many bytes of
 * sk->tp_pinfo.tp_raw4.filter to optval.
 *
 * NOTE(review): the declaration of len and the result of each failed
 * user-space access fall on embedded lines that are missing from this
 * listing.  If len is a signed int, the comparison with sizeof() is
 * unsigned, so a negative length is clamped too - confirm the type.
 */
536 static int raw_geticmpfilter(struct sock
*sk
, char *optval
, int *optlen
)
540 if (get_user(len
,optlen
))
542 if (len
> sizeof(struct icmp_filter
))
543 len
= sizeof(struct icmp_filter
);
544 if (put_user(len
, optlen
))
546 if (copy_to_user(optval
, &sk
->tp_pinfo
.tp_raw4
.filter
, len
))
/*
 * raw_setsockopt - setsockopt handler for raw sockets (listed in raw_prot).
 *
 * Any level other than SOL_RAW is forwarded unchanged to
 * ip_setsockopt().  At SOL_RAW the visible path tests that the socket's
 * protocol is IPPROTO_ICMP before handing optval/optlen to
 * raw_seticmpfilter().
 *
 * NOTE(review): the dispatch on optname and the error results for
 * non-ICMP sockets and unknown options fall on embedded lines that are
 * missing from this listing.
 */
551 static int raw_setsockopt(struct sock
*sk
, int level
, int optname
,
552 char *optval
, int optlen
)
554 if (level
!= SOL_RAW
)
555 return ip_setsockopt(sk
, level
, optname
, optval
, optlen
);
559 if (sk
->num
!= IPPROTO_ICMP
)
561 return raw_seticmpfilter(sk
, optval
, optlen
);
/*
 * raw_getsockopt - getsockopt handler for raw sockets (listed in raw_prot).
 *
 * Any level other than SOL_RAW is forwarded unchanged to
 * ip_getsockopt().  At SOL_RAW the visible path tests that the socket's
 * protocol is IPPROTO_ICMP before handing optval/optlen to
 * raw_geticmpfilter().
 *
 * NOTE(review): the dispatch on optname and the error results for
 * non-ICMP sockets and unknown options fall on embedded lines that are
 * missing from this listing.
 */
567 static int raw_getsockopt(struct sock
*sk
, int level
, int optname
,
568 char *optval
, int *optlen
)
570 if (level
!= SOL_RAW
)
571 return ip_getsockopt(sk
, level
, optname
, optval
, optlen
);
575 if (sk
->num
!= IPPROTO_ICMP
)
577 return raw_geticmpfilter(sk
, optval
, optlen
);
/*
 * get_raw_sock - format one raw socket as a text row into tmpbuf.
 *
 * i is printed as the row's first column.  timer_active is 2 when
 * sp->timer.prev is non-NULL and 0 otherwise; timer_expires is the
 * timer's expiry in the first case and the current jiffies in the
 * second, so the printed remaining time is 0 for an idle timer.  The
 * row also carries the source/destination address and port values, the
 * socket state, the write/read memory counters, the owner uid, the
 * inode number, the reference count and the socket pointer.
 *
 * NOTE(review): sp->socket is dereferenced unconditionally for i_uid,
 * while the very next argument guards sp->socket against NULL before
 * reading i_ino - one of the two is inconsistent, and the unguarded
 * access would fault for a socket with no struct socket attached.
 * sprintf() writes to tmpbuf without a bound; the buffer's size is set
 * by the caller and is not visible here.  The assignments to src, dest,
 * srcp and destp fall on embedded lines that are missing from this
 * listing.
 */
583 static void get_raw_sock(struct sock
*sp
, char *tmpbuf
, int i
)
585 unsigned int dest
, src
;
588 unsigned long timer_expires
;
594 timer_active
= (sp
->timer
.prev
!= NULL
) ? 2 : 0;
595 timer_expires
= (timer_active
== 2 ? sp
->timer
.expires
: jiffies
);
596 sprintf(tmpbuf
, "%4d: %08X:%04X %08X:%04X"
597 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p",
598 i
, src
, srcp
, dest
, destp
, sp
->state
,
599 atomic_read(&sp
->wmem_alloc
), atomic_read(&sp
->rmem_alloc
),
600 timer_active
, timer_expires
-jiffies
, 0,
601 sp
->socket
->inode
->i_uid
, 0,
602 sp
->socket
? sp
->socket
->inode
->i_ino
: 0,
603 atomic_read(&sp
->refcnt
), sp
);
/*
 * raw_get_info - produce the text listing of all raw sockets.
 *
 * Writes a column-header row into buffer, then, with raw_v4_lock held
 * for reading, walks every chain of raw_v4_htable.  Each socket is
 * tested for family PF_INET, formatted with get_raw_sock() using the
 * bucket index i as its row number, and appended to buffer.  Every row
 * is padded to 127 characters plus a newline, i.e. 128 bytes.  After
 * the lock is dropped, *start is set to buffer + begin, where
 * begin = len - (pos - offset).
 *
 * NOTE(review): the declarations of pos, begin, sk and tmpbuf, the
 * handling of offset/length while iterating, the outcome of the family
 * test, and the return value fall on embedded lines that are missing
 * from this listing.
 */
606 int raw_get_info(char *buffer
, char **start
, off_t offset
, int length
, int dummy
)
608 int len
= 0, num
= 0, i
;
614 len
+= sprintf(buffer
, "%-127s\n",
615 " sl local_address rem_address st tx_queue "
616 "rx_queue tr tm->when retrnsmt uid timeout inode");
618 read_lock(&raw_v4_lock
);
619 for (i
= 0; i
< RAWV4_HTABLE_SIZE
; i
++) {
622 for (sk
= raw_v4_htable
[i
]; sk
; sk
= sk
->next
, num
++) {
623 if (sk
->family
!= PF_INET
)
628 get_raw_sock(sk
, tmpbuf
, i
);
629 len
+= sprintf(buffer
+len
, "%-127s\n", tmpbuf
);
635 read_unlock(&raw_v4_lock
);
636 begin
= len
- (pos
- offset
);
637 *start
= buffer
+ begin
;
646 struct proto raw_prot
= {
647 raw_close
, /* close */
648 udp_connect
, /* connect */
649 udp_disconnect
, /* disconnect */
651 NULL
, /* retransmit */
652 NULL
, /* write_wakeup */
653 NULL
, /* read_wakeup */
654 datagram_poll
, /* poll */
655 #ifdef CONFIG_IP_MROUTE
656 ipmr_ioctl
, /* ioctl */
663 raw_setsockopt
, /* setsockopt */
664 raw_getsockopt
, /* getsockopt */
665 raw_sendmsg
, /* sendmsg */
666 raw_recvmsg
, /* recvmsg */
668 raw_rcv_skb
, /* backlog_rcv */
669 raw_v4_hash
, /* hash */
670 raw_v4_unhash
, /* unhash */
672 128, /* max_header */