4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
23 #include "system/network.h"
25 #include "lib/util/debug.h"
26 #include "common/logging.h"
28 #include "protocol/protocol_util.h"
30 #include "server/ipalloc_private.h"
33 * This is the length of the longtest common prefix between the IPs.
34 * It is calculated by XOR-ing the 2 IPs together and counting the
35 * number of leading zeroes. The implementation means that all
36 * addresses end up being 128 bits long.
38 * FIXME? Should we consider IPv4 and IPv6 separately given that the
39 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
40 * lots of nodes and IP addresses?
42 static uint32_t ip_distance(ctdb_sock_addr
*ip1
, ctdb_sock_addr
*ip2
)
44 uint32_t ip1_k
[IP_KEYLEN
];
49 uint32_t distance
= 0;
51 memcpy(ip1_k
, ip_key(ip1
), sizeof(ip1_k
));
53 for (i
=0; i
<IP_KEYLEN
; i
++) {
58 /* Count number of leading zeroes.
59 * FIXME? This could be optimised...
61 while ((x
& ((uint32_t)1 << 31)) == 0) {
71 /* Calculate the IP distance for the given IP relative to IPs on the
72 given node. The ips argument is generally the all_ips variable
73 used in the main part of the algorithm.
75 static uint32_t ip_distance_2_sum(ctdb_sock_addr
*ip
,
76 struct public_ip_list
*ips
,
79 struct public_ip_list
*t
;
84 for (t
= ips
; t
!= NULL
; t
= t
->next
) {
89 /* Optimisation: We never calculate the distance
90 * between an address and itself. This allows us to
91 * calculate the effect of removing an address from a
92 * node by simply calculating the distance between
93 * that address and all of the existing addresses.
94 * Moreover, we assume that we're only ever dealing
95 * with addresses from all_ips so we can identify an
96 * address via a pointer rather than doing a more
97 * expensive address comparison. */
98 if (&(t
->addr
) == ip
) {
102 d
= ip_distance(ip
, &(t
->addr
));
103 sum
+= d
* d
; /* Cheaper than pulling in math.h :-) */
109 /* Return the LCP2 imbalance metric for addresses currently assigned
112 static uint32_t lcp2_imbalance(struct public_ip_list
* all_ips
,
115 struct public_ip_list
*t
;
117 uint32_t imbalance
= 0;
119 for (t
= all_ips
; t
!= NULL
; t
= t
->next
) {
123 /* Pass the rest of the IPs rather than the whole
126 imbalance
+= ip_distance_2_sum(&(t
->addr
), t
->next
, pnn
);
132 static bool lcp2_init(struct ipalloc_state
*ipalloc_state
,
133 uint32_t **lcp2_imbalances
,
134 bool **rebalance_candidates
)
136 unsigned int i
, numnodes
;
137 struct public_ip_list
*t
;
139 numnodes
= ipalloc_state
->num
;
141 *rebalance_candidates
= talloc_array(ipalloc_state
, bool, numnodes
);
142 if (*rebalance_candidates
== NULL
) {
143 DEBUG(DEBUG_ERR
, (__location__
" out of memory\n"));
146 *lcp2_imbalances
= talloc_array(ipalloc_state
, uint32_t, numnodes
);
147 if (*lcp2_imbalances
== NULL
) {
148 DEBUG(DEBUG_ERR
, (__location__
" out of memory\n"));
152 for (i
=0; i
<numnodes
; i
++) {
153 (*lcp2_imbalances
)[i
] =
154 lcp2_imbalance(ipalloc_state
->all_ips
, i
);
155 /* First step: assume all nodes are candidates */
156 (*rebalance_candidates
)[i
] = true;
159 /* 2nd step: if a node has IPs assigned then it must have been
160 * healthy before, so we remove it from consideration. This
161 * is overkill but is all we have because we don't maintain
162 * state between takeover runs. An alternative would be to
163 * keep state and invalidate it every time the recovery master
166 for (t
= ipalloc_state
->all_ips
; t
!= NULL
; t
= t
->next
) {
167 if (t
->pnn
!= CTDB_UNKNOWN_PNN
) {
168 (*rebalance_candidates
)[t
->pnn
] = false;
172 /* 3rd step: if a node is forced to re-balance then
173 we allow failback onto the node */
174 if (ipalloc_state
->force_rebalance_nodes
== NULL
) {
178 i
< talloc_array_length(ipalloc_state
->force_rebalance_nodes
);
180 uint32_t pnn
= ipalloc_state
->force_rebalance_nodes
[i
];
181 if (pnn
>= numnodes
) {
183 (__location__
"unknown node %u\n", pnn
));
188 ("Forcing rebalancing of IPs to node %u\n", pnn
));
189 (*rebalance_candidates
)[pnn
] = true;
195 /* Allocate any unassigned addresses using the LCP2 algorithm to find
196 * the IP/node combination that will cost the least.
198 static void lcp2_allocate_unassigned(struct ipalloc_state
*ipalloc_state
,
199 uint32_t *lcp2_imbalances
)
201 struct public_ip_list
*t
;
202 unsigned int dstnode
, numnodes
;
204 unsigned int minnode
;
205 uint32_t mindsum
, dstdsum
, dstimbl
;
206 uint32_t minimbl
= 0;
207 struct public_ip_list
*minip
;
209 bool should_loop
= true;
210 bool have_unassigned
= true;
212 numnodes
= ipalloc_state
->num
;
214 while (have_unassigned
&& should_loop
) {
217 DEBUG(DEBUG_DEBUG
,(" ----------------------------------------\n"));
218 DEBUG(DEBUG_DEBUG
,(" CONSIDERING MOVES (UNASSIGNED)\n"));
220 minnode
= CTDB_UNKNOWN_PNN
;
224 /* loop over each unassigned ip. */
225 for (t
= ipalloc_state
->all_ips
; t
!= NULL
; t
= t
->next
) {
226 if (t
->pnn
!= CTDB_UNKNOWN_PNN
) {
230 for (dstnode
= 0; dstnode
< numnodes
; dstnode
++) {
231 /* only check nodes that can actually takeover this ip */
232 if (!can_node_takeover_ip(ipalloc_state
,
235 /* no it couldn't so skip to the next node */
239 dstdsum
= ip_distance_2_sum(&(t
->addr
),
240 ipalloc_state
->all_ips
,
242 dstimbl
= lcp2_imbalances
[dstnode
] + dstdsum
;
244 (" %s -> %d [+%d]\n",
245 ctdb_sock_addr_to_string(ipalloc_state
,
249 dstimbl
- lcp2_imbalances
[dstnode
]));
252 if (minnode
== CTDB_UNKNOWN_PNN
||
263 DEBUG(DEBUG_DEBUG
,(" ----------------------------------------\n"));
265 /* If we found one then assign it to the given node. */
266 if (minnode
!= CTDB_UNKNOWN_PNN
) {
267 minip
->pnn
= minnode
;
268 lcp2_imbalances
[minnode
] = minimbl
;
269 DEBUG(DEBUG_INFO
,(" %s -> %d [+%d]\n",
270 ctdb_sock_addr_to_string(
272 &(minip
->addr
), false),
277 /* There might be a better way but at least this is clear. */
278 have_unassigned
= false;
279 for (t
= ipalloc_state
->all_ips
; t
!= NULL
; t
= t
->next
) {
280 if (t
->pnn
== CTDB_UNKNOWN_PNN
) {
281 have_unassigned
= true;
286 /* We know if we have an unassigned addresses so we might as
289 if (have_unassigned
) {
290 for (t
= ipalloc_state
->all_ips
; t
!= NULL
; t
= t
->next
) {
291 if (t
->pnn
== CTDB_UNKNOWN_PNN
) {
293 ("Failed to find node to cover ip %s\n",
294 ctdb_sock_addr_to_string(ipalloc_state
,
302 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
303 * to move IPs from, determines the best IP/destination node
304 * combination to move from the source node.
306 static bool lcp2_failback_candidate(struct ipalloc_state
*ipalloc_state
,
307 unsigned int srcnode
,
308 uint32_t *lcp2_imbalances
,
309 bool *rebalance_candidates
)
311 unsigned int dstnode
, mindstnode
, numnodes
;
312 uint32_t srcdsum
, dstimbl
, dstdsum
;
313 uint32_t minsrcimbl
, mindstimbl
;
314 struct public_ip_list
*minip
;
315 struct public_ip_list
*t
;
317 /* Find an IP and destination node that best reduces imbalance. */
320 mindstnode
= CTDB_UNKNOWN_PNN
;
323 numnodes
= ipalloc_state
->num
;
325 DEBUG(DEBUG_DEBUG
,(" ----------------------------------------\n"));
326 DEBUG(DEBUG_DEBUG
,(" CONSIDERING MOVES FROM %d [%d]\n",
327 srcnode
, lcp2_imbalances
[srcnode
]));
329 for (t
= ipalloc_state
->all_ips
; t
!= NULL
; t
= t
->next
) {
332 /* Only consider addresses on srcnode. */
333 if (t
->pnn
!= srcnode
) {
337 /* What is this IP address costing the source node? */
338 srcdsum
= ip_distance_2_sum(&(t
->addr
),
339 ipalloc_state
->all_ips
,
341 srcimbl
= lcp2_imbalances
[srcnode
] - srcdsum
;
343 /* Consider this IP address would cost each potential
344 * destination node. Destination nodes are limited to
345 * those that are newly healthy, since we don't want
346 * to do gratuitous failover of IPs just to make minor
347 * balance improvements.
349 for (dstnode
= 0; dstnode
< numnodes
; dstnode
++) {
350 if (!rebalance_candidates
[dstnode
]) {
354 /* only check nodes that can actually takeover this ip */
355 if (!can_node_takeover_ip(ipalloc_state
, dstnode
,
357 /* no it couldn't so skip to the next node */
361 dstdsum
= ip_distance_2_sum(&(t
->addr
),
362 ipalloc_state
->all_ips
,
364 dstimbl
= lcp2_imbalances
[dstnode
] + dstdsum
;
365 DEBUG(DEBUG_DEBUG
,(" %d [%d] -> %s -> %d [+%d]\n",
367 ctdb_sock_addr_to_string(
372 if ((dstimbl
< lcp2_imbalances
[srcnode
]) &&
373 (dstdsum
< srcdsum
) && \
374 ((mindstnode
== CTDB_UNKNOWN_PNN
) || \
375 ((srcimbl
+ dstimbl
) < (minsrcimbl
+ mindstimbl
)))) {
378 minsrcimbl
= srcimbl
;
379 mindstnode
= dstnode
;
380 mindstimbl
= dstimbl
;
384 DEBUG(DEBUG_DEBUG
,(" ----------------------------------------\n"));
386 if (mindstnode
!= CTDB_UNKNOWN_PNN
) {
387 /* We found a move that makes things better... */
389 ("%d [%d] -> %s -> %d [+%d]\n",
390 srcnode
, minsrcimbl
- lcp2_imbalances
[srcnode
],
391 ctdb_sock_addr_to_string(ipalloc_state
,
392 &(minip
->addr
), false),
393 mindstnode
, mindstimbl
- lcp2_imbalances
[mindstnode
]));
396 lcp2_imbalances
[srcnode
] = minsrcimbl
;
397 lcp2_imbalances
[mindstnode
] = mindstimbl
;
398 minip
->pnn
= mindstnode
;
406 struct lcp2_imbalance_pnn
{
411 static int lcp2_cmp_imbalance_pnn(const void * a
, const void * b
)
413 const struct lcp2_imbalance_pnn
* lipa
= (const struct lcp2_imbalance_pnn
*) a
;
414 const struct lcp2_imbalance_pnn
* lipb
= (const struct lcp2_imbalance_pnn
*) b
;
416 if (lipa
->imbalance
> lipb
->imbalance
) {
418 } else if (lipa
->imbalance
== lipb
->imbalance
) {
425 /* LCP2 algorithm for rebalancing the cluster. This finds the source
426 * node with the highest LCP2 imbalance, and then determines the best
427 * IP/destination node combination to move from the source node.
429 static void lcp2_failback(struct ipalloc_state
*ipalloc_state
,
430 uint32_t *lcp2_imbalances
,
431 bool *rebalance_candidates
)
434 struct lcp2_imbalance_pnn
* lips
;
437 numnodes
= ipalloc_state
->num
;
440 /* Put the imbalances and nodes into an array, sort them and
441 * iterate through candidates. Usually the 1st one will be
442 * used, so this doesn't cost much...
444 DEBUG(DEBUG_DEBUG
,("+++++++++++++++++++++++++++++++++++++++++\n"));
445 DEBUG(DEBUG_DEBUG
,("Selecting most imbalanced node from:\n"));
446 lips
= talloc_array(ipalloc_state
, struct lcp2_imbalance_pnn
, numnodes
);
447 for (i
= 0; i
< numnodes
; i
++) {
448 lips
[i
].imbalance
= lcp2_imbalances
[i
];
450 DEBUG(DEBUG_DEBUG
,(" %d [%d]\n", i
, lcp2_imbalances
[i
]));
452 qsort(lips
, numnodes
, sizeof(struct lcp2_imbalance_pnn
),
453 lcp2_cmp_imbalance_pnn
);
456 for (i
= 0; i
< numnodes
; i
++) {
457 /* This means that all nodes had 0 or 1 addresses, so
458 * can't be imbalanced.
460 if (lips
[i
].imbalance
== 0) {
464 if (lcp2_failback_candidate(ipalloc_state
,
467 rebalance_candidates
)) {
479 bool ipalloc_lcp2(struct ipalloc_state
*ipalloc_state
)
481 uint32_t *lcp2_imbalances
;
482 bool *rebalance_candidates
;
484 bool have_rebalance_candidates
;
487 unassign_unsuitable_ips(ipalloc_state
);
489 if (!lcp2_init(ipalloc_state
,
490 &lcp2_imbalances
, &rebalance_candidates
)) {
495 lcp2_allocate_unassigned(ipalloc_state
, lcp2_imbalances
);
497 /* If we don't want IPs to fail back then don't rebalance IPs. */
498 if (ipalloc_state
->no_ip_failback
) {
502 /* It is only worth continuing if we have suitable target
503 * nodes to transfer IPs to. This check is much cheaper than
506 numnodes
= ipalloc_state
->num
;
507 have_rebalance_candidates
= false;
508 for (i
=0; i
<numnodes
; i
++) {
509 if (rebalance_candidates
[i
]) {
510 have_rebalance_candidates
= true;
514 if (!have_rebalance_candidates
) {
518 /* Now, try to make sure the ip addresses are evenly distributed
521 lcp2_failback(ipalloc_state
, lcp2_imbalances
, rebalance_candidates
);