1 /* Cluster IP hashmark target
2 * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
3 * based on ideas of Fabio Olive Leite <olive@unixforge.org>
5 * Development of this code funded by SuSE Linux AG, http://www.suse.com/
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
12 #include <linux/module.h>
13 #include <linux/config.h>
14 #include <linux/proc_fs.h>
15 #include <linux/jhash.h>
16 #include <linux/skbuff.h>
18 #include <linux/tcp.h>
19 #include <linux/udp.h>
20 #include <linux/icmp.h>
21 #include <linux/if_arp.h>
22 #include <linux/proc_fs.h>
23 #include <linux/seq_file.h>
25 #include <net/checksum.h>
27 #include <linux/netfilter_arp.h>
29 #include <linux/netfilter_ipv4/ip_tables.h>
30 #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
31 #include <linux/netfilter_ipv4/ip_conntrack.h>
33 #define CLUSTERIP_VERSION "0.7"
35 #define DEBUG_CLUSTERIP
37 #ifdef DEBUG_CLUSTERIP
43 #define ASSERT_READ_LOCK(x)
45 MODULE_LICENSE("GPL");
46 MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
47 MODULE_DESCRIPTION("iptables target for CLUSTERIP");
49 struct clusterip_config
{
50 struct list_head list
; /* list of all configs */
51 atomic_t refcount
; /* reference count */
53 u_int32_t clusterip
; /* the IP address */
54 u_int8_t clustermac
[ETH_ALEN
]; /* the MAC address */
55 struct net_device
*dev
; /* device */
56 u_int16_t num_total_nodes
; /* total number of nodes */
57 u_int16_t num_local_nodes
; /* number of local nodes */
58 u_int16_t local_nodes
[CLUSTERIP_MAX_NODES
]; /* node number array */
61 struct proc_dir_entry
*pde
; /* proc dir entry */
63 enum clusterip_hashmode hash_mode
; /* which hashing mode */
64 u_int32_t hash_initval
; /* hash initialization */
67 static LIST_HEAD(clusterip_configs
);
69 /* clusterip_lock protects the clusterip_configs list _AND_ the configurable
70 * data within all structures (num_local_nodes, local_nodes[]) */
71 static DEFINE_RWLOCK(clusterip_lock
);
74 static struct file_operations clusterip_proc_fops
;
75 static struct proc_dir_entry
*clusterip_procdir
;
79 clusterip_config_get(struct clusterip_config
*c
) {
80 atomic_inc(&c
->refcount
);
84 clusterip_config_put(struct clusterip_config
*c
) {
85 if (atomic_dec_and_test(&c
->refcount
)) {
86 write_lock_bh(&clusterip_lock
);
88 write_unlock_bh(&clusterip_lock
);
89 dev_mc_delete(c
->dev
, c
->clustermac
, ETH_ALEN
, 0);
96 static struct clusterip_config
*
97 __clusterip_config_find(u_int32_t clusterip
)
99 struct list_head
*pos
;
101 ASSERT_READ_LOCK(&clusterip_lock
);
102 list_for_each(pos
, &clusterip_configs
) {
103 struct clusterip_config
*c
= list_entry(pos
,
104 struct clusterip_config
, list
);
105 if (c
->clusterip
== clusterip
) {
113 static inline struct clusterip_config
*
114 clusterip_config_find_get(u_int32_t clusterip
)
116 struct clusterip_config
*c
;
118 read_lock_bh(&clusterip_lock
);
119 c
= __clusterip_config_find(clusterip
);
121 read_unlock_bh(&clusterip_lock
);
124 atomic_inc(&c
->refcount
);
125 read_unlock_bh(&clusterip_lock
);
130 static struct clusterip_config
*
131 clusterip_config_init(struct ipt_clusterip_tgt_info
*i
, u_int32_t ip
,
132 struct net_device
*dev
)
134 struct clusterip_config
*c
;
137 c
= kmalloc(sizeof(*c
), GFP_ATOMIC
);
141 memset(c
, 0, sizeof(*c
));
144 memcpy(&c
->clustermac
, &i
->clustermac
, ETH_ALEN
);
145 c
->num_total_nodes
= i
->num_total_nodes
;
146 c
->num_local_nodes
= i
->num_local_nodes
;
147 memcpy(&c
->local_nodes
, &i
->local_nodes
, sizeof(&c
->local_nodes
));
148 c
->hash_mode
= i
->hash_mode
;
149 c
->hash_initval
= i
->hash_initval
;
150 atomic_set(&c
->refcount
, 1);
152 #ifdef CONFIG_PROC_FS
153 /* create proc dir entry */
154 sprintf(buffer
, "%u.%u.%u.%u", NIPQUAD(ip
));
155 c
->pde
= create_proc_entry(buffer
, S_IWUSR
|S_IRUSR
, clusterip_procdir
);
160 c
->pde
->proc_fops
= &clusterip_proc_fops
;
164 write_lock_bh(&clusterip_lock
);
165 list_add(&c
->list
, &clusterip_configs
);
166 write_unlock_bh(&clusterip_lock
);
172 clusterip_add_node(struct clusterip_config
*c
, u_int16_t nodenum
)
176 write_lock_bh(&clusterip_lock
);
178 if (c
->num_local_nodes
>= CLUSTERIP_MAX_NODES
179 || nodenum
> CLUSTERIP_MAX_NODES
) {
180 write_unlock_bh(&clusterip_lock
);
184 /* check if we already have this number in our array */
185 for (i
= 0; i
< c
->num_local_nodes
; i
++) {
186 if (c
->local_nodes
[i
] == nodenum
) {
187 write_unlock_bh(&clusterip_lock
);
192 c
->local_nodes
[c
->num_local_nodes
++] = nodenum
;
194 write_unlock_bh(&clusterip_lock
);
199 clusterip_del_node(struct clusterip_config
*c
, u_int16_t nodenum
)
203 write_lock_bh(&clusterip_lock
);
205 if (c
->num_local_nodes
<= 1 || nodenum
> CLUSTERIP_MAX_NODES
) {
206 write_unlock_bh(&clusterip_lock
);
210 for (i
= 0; i
< c
->num_local_nodes
; i
++) {
211 if (c
->local_nodes
[i
] == nodenum
) {
212 int size
= sizeof(u_int16_t
)*(c
->num_local_nodes
-(i
+1));
213 memmove(&c
->local_nodes
[i
], &c
->local_nodes
[i
+1], size
);
214 c
->num_local_nodes
--;
215 write_unlock_bh(&clusterip_lock
);
220 write_unlock_bh(&clusterip_lock
);
224 static inline u_int32_t
225 clusterip_hashfn(struct sk_buff
*skb
, struct clusterip_config
*config
)
227 struct iphdr
*iph
= skb
->nh
.iph
;
228 unsigned long hashval
;
229 u_int16_t sport
, dport
;
234 switch (iph
->protocol
) {
236 th
= (void *)iph
+iph
->ihl
*4;
237 sport
= ntohs(th
->source
);
238 dport
= ntohs(th
->dest
);
241 uh
= (void *)iph
+iph
->ihl
*4;
242 sport
= ntohs(uh
->source
);
243 dport
= ntohs(uh
->dest
);
246 ih
= (void *)iph
+iph
->ihl
*4;
247 sport
= ntohs(ih
->un
.echo
.id
);
248 dport
= (ih
->type
<<8)|ih
->code
;
251 if (net_ratelimit()) {
252 printk(KERN_NOTICE
"CLUSTERIP: unknown protocol `%u'\n",
258 switch (config
->hash_mode
) {
259 case CLUSTERIP_HASHMODE_SIP
:
260 hashval
= jhash_1word(ntohl(iph
->saddr
),
261 config
->hash_initval
);
263 case CLUSTERIP_HASHMODE_SIP_SPT
:
264 hashval
= jhash_2words(ntohl(iph
->saddr
), sport
,
265 config
->hash_initval
);
267 case CLUSTERIP_HASHMODE_SIP_SPT_DPT
:
268 hashval
= jhash_3words(ntohl(iph
->saddr
), sport
, dport
,
269 config
->hash_initval
);
272 /* to make gcc happy */
274 /* This cannot happen, unless the check function wasn't called
275 * at rule load time */
276 printk("CLUSTERIP: unknown mode `%u'\n", config
->hash_mode
);
281 /* node numbers are 1..n, not 0..n */
282 return ((hashval
% config
->num_total_nodes
)+1);
286 clusterip_responsible(struct clusterip_config
*config
, u_int32_t hash
)
290 read_lock_bh(&clusterip_lock
);
292 if (config
->num_local_nodes
== 0) {
293 read_unlock_bh(&clusterip_lock
);
297 for (i
= 0; i
< config
->num_local_nodes
; i
++) {
298 if (config
->local_nodes
[i
] == hash
) {
299 read_unlock_bh(&clusterip_lock
);
304 read_unlock_bh(&clusterip_lock
);
309 /***********************************************************************
311 ***********************************************************************/
314 target(struct sk_buff
**pskb
,
315 const struct net_device
*in
,
316 const struct net_device
*out
,
317 unsigned int hooknum
,
318 const void *targinfo
,
321 const struct ipt_clusterip_tgt_info
*cipinfo
= targinfo
;
322 enum ip_conntrack_info ctinfo
;
323 struct ip_conntrack
*ct
= ip_conntrack_get((*pskb
), &ctinfo
);
326 /* don't need to clusterip_config_get() here, since refcount
327 * is only decremented by destroy() - and ip_tables guarantees
328 * that the ->target() function isn't called after ->destroy() */
331 printk(KERN_ERR
"CLUSTERIP: no conntrack!\n");
332 /* FIXME: need to drop invalid ones, since replies
333 * to outgoing connections of other nodes will be
334 * marked as INVALID */
338 /* special case: ICMP error handling. conntrack distinguishes between
339 * error messages (RELATED) and information requests (see below) */
340 if ((*pskb
)->nh
.iph
->protocol
== IPPROTO_ICMP
341 && (ctinfo
== IP_CT_RELATED
342 || ctinfo
== IP_CT_RELATED
+IP_CT_IS_REPLY
))
345 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
346 * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
347 * on, which all have an ID field [relevant for hashing]. */
349 hash
= clusterip_hashfn(*pskb
, cipinfo
->config
);
356 case IP_CT_RELATED
+IP_CT_IS_REPLY
:
357 /* FIXME: we don't handle expectations at the
358 * moment. they can arrive on a different node than
359 * the master connection (e.g. FTP passive mode) */
360 case IP_CT_ESTABLISHED
:
361 case IP_CT_ESTABLISHED
+IP_CT_IS_REPLY
:
367 #ifdef DEBUG_CLUSTERP
368 DUMP_TUPLE(&ct
->tuplehash
[IP_CT_DIR_ORIGINAL
].tuple
);
370 DEBUGP("hash=%u ct_hash=%lu ", hash
, ct
->mark
);
371 if (!clusterip_responsible(cipinfo
->config
, hash
)) {
372 DEBUGP("not responsible\n");
375 DEBUGP("responsible\n");
377 /* despite being received via linklayer multicast, this is
378 * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
379 (*pskb
)->pkt_type
= PACKET_HOST
;
385 checkentry(const char *tablename
,
386 const struct ipt_entry
*e
,
388 unsigned int targinfosize
,
389 unsigned int hook_mask
)
391 struct ipt_clusterip_tgt_info
*cipinfo
= targinfo
;
393 struct clusterip_config
*config
;
395 if (targinfosize
!= IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info
))) {
396 printk(KERN_WARNING
"CLUSTERIP: targinfosize %u != %Zu\n",
398 IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info
)));
402 if (cipinfo
->hash_mode
!= CLUSTERIP_HASHMODE_SIP
&&
403 cipinfo
->hash_mode
!= CLUSTERIP_HASHMODE_SIP_SPT
&&
404 cipinfo
->hash_mode
!= CLUSTERIP_HASHMODE_SIP_SPT_DPT
) {
405 printk(KERN_WARNING
"CLUSTERIP: unknown mode `%u'\n",
410 if (e
->ip
.dmsk
.s_addr
!= 0xffffffff
411 || e
->ip
.dst
.s_addr
== 0) {
412 printk(KERN_ERR
"CLUSTERIP: Please specify destination IP\n");
416 /* FIXME: further sanity checks */
418 config
= clusterip_config_find_get(e
->ip
.dst
.s_addr
);
420 if (!(cipinfo
->flags
& CLUSTERIP_FLAG_NEW
)) {
421 printk(KERN_WARNING
"CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e
->ip
.dst
.s_addr
));
424 struct net_device
*dev
;
426 if (e
->ip
.iniface
[0] == '\0') {
427 printk(KERN_WARNING
"CLUSTERIP: Please specify an interface name\n");
431 dev
= dev_get_by_name(e
->ip
.iniface
);
433 printk(KERN_WARNING
"CLUSTERIP: no such interface %s\n", e
->ip
.iniface
);
437 config
= clusterip_config_init(cipinfo
,
438 e
->ip
.dst
.s_addr
, dev
);
440 printk(KERN_WARNING
"CLUSTERIP: cannot allocate config\n");
444 dev_mc_add(config
->dev
,config
->clustermac
, ETH_ALEN
, 0);
448 cipinfo
->config
= config
;
453 /* drop reference count of cluster config when rule is deleted */
454 static void destroy(void *matchinfo
, unsigned int matchinfosize
)
456 struct ipt_clusterip_tgt_info
*cipinfo
= matchinfo
;
458 /* we first remove the proc entry and then drop the reference
459 * count. In case anyone still accesses the file, the open/close
460 * functions are also incrementing the refcount on their own */
461 #ifdef CONFIG_PROC_FS
462 remove_proc_entry(cipinfo
->config
->pde
->name
,
463 cipinfo
->config
->pde
->parent
);
465 clusterip_config_put(cipinfo
->config
);
468 static struct ipt_target clusterip_tgt
= {
471 .checkentry
= &checkentry
,
477 /***********************************************************************
479 ***********************************************************************/
481 /* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
483 u_int8_t src_hw
[ETH_ALEN
];
485 u_int8_t dst_hw
[ETH_ALEN
];
487 } __attribute__ ((packed
));
489 #ifdef CLUSTERIP_DEBUG
490 static void arp_print(struct arp_payload
*payload
)
492 #define HBUFFERLEN 30
493 char hbuffer
[HBUFFERLEN
];
495 const char hexbuf
[]= "0123456789abcdef";
497 for (k
=0, j
=0; k
< HBUFFERLEN
-3 && j
< ETH_ALEN
; j
++) {
498 hbuffer
[k
++]=hexbuf
[(payload
->src_hw
[j
]>>4)&15];
499 hbuffer
[k
++]=hexbuf
[payload
->src_hw
[j
]&15];
504 printk("src %u.%u.%u.%u@%s, dst %u.%u.%u.%u\n",
505 NIPQUAD(payload
->src_ip
), hbuffer
,
506 NIPQUAD(payload
->dst_ip
));
511 arp_mangle(unsigned int hook
,
512 struct sk_buff
**pskb
,
513 const struct net_device
*in
,
514 const struct net_device
*out
,
515 int (*okfn
)(struct sk_buff
*))
517 struct arphdr
*arp
= (*pskb
)->nh
.arph
;
518 struct arp_payload
*payload
;
519 struct clusterip_config
*c
;
521 /* we don't care about non-ethernet and non-ipv4 ARP */
522 if (arp
->ar_hrd
!= htons(ARPHRD_ETHER
)
523 || arp
->ar_pro
!= htons(ETH_P_IP
)
524 || arp
->ar_pln
!= 4 || arp
->ar_hln
!= ETH_ALEN
)
527 /* we only want to mangle arp requests and replies */
528 if (arp
->ar_op
!= htons(ARPOP_REPLY
)
529 && arp
->ar_op
!= htons(ARPOP_REQUEST
))
532 payload
= (void *)(arp
+1);
534 /* if there is no clusterip configuration for the arp reply's
535 * source ip, we don't want to mangle it */
536 c
= clusterip_config_find_get(payload
->src_ip
);
540 /* normally the linux kernel always replies to arp queries of
541 * addresses on different interfaces. However, in the CLUSTERIP case
542 * this wouldn't work, since we didn't subscribe the mcast group on
543 * other interfaces */
545 DEBUGP("CLUSTERIP: not mangling arp reply on different "
546 "interface: cip'%s'-skb'%s'\n", c
->dev
->name
, out
->name
);
547 clusterip_config_put(c
);
551 /* mangle reply hardware address */
552 memcpy(payload
->src_hw
, c
->clustermac
, arp
->ar_hln
);
554 #ifdef CLUSTERIP_DEBUG
555 DEBUGP(KERN_DEBUG
"CLUSTERIP mangled arp reply: ");
559 clusterip_config_put(c
);
564 static struct nf_hook_ops cip_arp_ops
= {
567 .hooknum
= NF_ARP_OUT
,
571 /***********************************************************************
573 ***********************************************************************/
575 #ifdef CONFIG_PROC_FS
577 static void *clusterip_seq_start(struct seq_file
*s
, loff_t
*pos
)
579 struct proc_dir_entry
*pde
= s
->private;
580 struct clusterip_config
*c
= pde
->data
;
581 unsigned int *nodeidx
;
583 read_lock_bh(&clusterip_lock
);
584 if (*pos
>= c
->num_local_nodes
)
587 nodeidx
= kmalloc(sizeof(unsigned int), GFP_KERNEL
);
589 return ERR_PTR(-ENOMEM
);
595 static void *clusterip_seq_next(struct seq_file
*s
, void *v
, loff_t
*pos
)
597 struct proc_dir_entry
*pde
= s
->private;
598 struct clusterip_config
*c
= pde
->data
;
599 unsigned int *nodeidx
= (unsigned int *)v
;
602 if (*pos
>= c
->num_local_nodes
) {
609 static void clusterip_seq_stop(struct seq_file
*s
, void *v
)
613 read_unlock_bh(&clusterip_lock
);
616 static int clusterip_seq_show(struct seq_file
*s
, void *v
)
618 struct proc_dir_entry
*pde
= s
->private;
619 struct clusterip_config
*c
= pde
->data
;
620 unsigned int *nodeidx
= (unsigned int *)v
;
624 seq_printf(s
, "%u", c
->local_nodes
[*nodeidx
]);
626 if (*nodeidx
== c
->num_local_nodes
-1)
632 static struct seq_operations clusterip_seq_ops
= {
633 .start
= clusterip_seq_start
,
634 .next
= clusterip_seq_next
,
635 .stop
= clusterip_seq_stop
,
636 .show
= clusterip_seq_show
,
639 static int clusterip_proc_open(struct inode
*inode
, struct file
*file
)
641 int ret
= seq_open(file
, &clusterip_seq_ops
);
644 struct seq_file
*sf
= file
->private_data
;
645 struct proc_dir_entry
*pde
= PDE(inode
);
646 struct clusterip_config
*c
= pde
->data
;
650 clusterip_config_get(c
);
656 static int clusterip_proc_release(struct inode
*inode
, struct file
*file
)
658 struct proc_dir_entry
*pde
= PDE(inode
);
659 struct clusterip_config
*c
= pde
->data
;
662 ret
= seq_release(inode
, file
);
665 clusterip_config_put(c
);
670 static ssize_t
clusterip_proc_write(struct file
*file
, const char __user
*input
,
671 size_t size
, loff_t
*ofs
)
673 #define PROC_WRITELEN 10
674 char buffer
[PROC_WRITELEN
+1];
675 struct proc_dir_entry
*pde
= PDE(file
->f_dentry
->d_inode
);
676 struct clusterip_config
*c
= pde
->data
;
677 unsigned long nodenum
;
679 if (copy_from_user(buffer
, input
, PROC_WRITELEN
))
682 if (*buffer
== '+') {
683 nodenum
= simple_strtoul(buffer
+1, NULL
, 10);
684 if (clusterip_add_node(c
, nodenum
))
686 } else if (*buffer
== '-') {
687 nodenum
= simple_strtoul(buffer
+1, NULL
,10);
688 if (clusterip_del_node(c
, nodenum
))
696 static struct file_operations clusterip_proc_fops
= {
697 .owner
= THIS_MODULE
,
698 .open
= clusterip_proc_open
,
700 .write
= clusterip_proc_write
,
702 .release
= clusterip_proc_release
,
705 #endif /* CONFIG_PROC_FS */
707 static int init_or_cleanup(int fini
)
714 if (ipt_register_target(&clusterip_tgt
)) {
719 if (nf_register_hook(&cip_arp_ops
) < 0) {
724 #ifdef CONFIG_PROC_FS
725 clusterip_procdir
= proc_mkdir("ipt_CLUSTERIP", proc_net
);
726 if (!clusterip_procdir
) {
727 printk(KERN_ERR
"CLUSTERIP: Unable to proc dir entry\n");
731 #endif /* CONFIG_PROC_FS */
733 printk(KERN_NOTICE
"ClusterIP Version %s loaded successfully\n",
739 printk(KERN_NOTICE
"ClusterIP Version %s unloading\n",
741 #ifdef CONFIG_PROC_FS
742 remove_proc_entry(clusterip_procdir
->name
, clusterip_procdir
->parent
);
745 nf_unregister_hook(&cip_arp_ops
);
747 ipt_unregister_target(&clusterip_tgt
);
752 static int __init
init(void)
754 return init_or_cleanup(0);
757 static void __exit
fini(void)