4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #pragma ident "%Z%%M% %I% %E% SMI"
29 * These functions implement the process of commitment for a pool
30 * configuration. This process can be described as taking instructions
31 * from a static configuration file and using the information about
32 * the target system contained in the dynamic configuration to make
33 * decisions about how best to allocate resources to meet the
34 * constraints specified in the static configuration file.
36 * Mechanically, this process relies upon ordering the individual
37 * components of the file and stepping through the lists of components
38 * and taking actions depending on their type and which file they are
41 * Configuration components can be broken down into different types
42 * which are then treated according to the following table:
46 * res_comp || res_agg If the element is a required element, then create or
47 * update it (don't destroy required elements in the
48 * static configuration) otherwise manipulate the
49 * dynamic configuration to create, destroy or update
50 * the element on the system.
51 * comp Create, destroy or update the static configuration
54 * The treatment of the different elements reflects the fact that all
55 * elements other than comp are configurable and thus libpool can
56 * create, destroy and modify these elements at will. comp elements
57 * reflect the disposition of the system, these elements can be moved
58 * around but they can't be created or destroyed in the dynamic
59 * configuration in the commit process. comp elements can be created
60 * and destroyed in the static configuration file as a result of a
61 * commit operation, since it's possible for a comp to not appear in
62 * the dynamic configuration. For instance, if the static
63 * configuration file was created on a different machine or after a DR
64 * operation which has removed or added components.
70 #include <sys/types.h>
77 #include "pool_internal.h"
78 #include "pool_impl.h"
80 #define MIN(x, y) ((x) < (y) ? (x) : (y))
81 #define MAX(x, y) ((x) > (y) ? (x) : (y))
82 #define POA_IMPORTANCE_NUM 0
83 #define POA_SURPLUS_TO_DEFAULT_NUM 1
86 * This resource specific structure is used to determine allocation of resources
87 * during resource set allocation. Each set will receive its min, plus
88 * some number of dealt resources based on the global allocation policy.
90 typedef struct res_info
{
91 pool_resource_t
*ri_res
; /* Resource set */
92 uint64_t ri_min
; /* Resource set's low watermark */
93 uint64_t ri_max
; /* Resource set's high watermark */
94 uint64_t ri_oldsize
; /* Size of resource set at the start */
95 uint64_t ri_newsize
; /* New resource set size allocated */
96 uint64_t ri_pinned
; /* Count of pinned resources in set */
97 uint64_t ri_dealt
; /* Count of resources dealt to set */
98 int64_t ri_transfer
; /* oldsize - newsize */
99 /* The signed quantity of resources */
100 /* to tranfer into or out of this */
102 /* + transfer: tranfer resources out */
103 /* - transfer: tranfer resources in */
107 * diff_and_fix operations
109 static int commit_create(pool_conf_t
*, pool_elem_t
**);
110 static int commit_delete(pool_elem_t
*);
111 static int commit_update(pool_elem_t
*, pool_elem_t
*, int);
114 * configuration commit processing
116 static int diff_and_fix(pool_conf_t
*, pool_conf_t
*);
117 static int process_elem_lt(pool_elem_t
*, pool_conf_t
*);
118 static int process_elem_gt(pool_elem_t
*, pool_conf_t
*,
120 static int process_lists(int, pool_conf_t
*,
122 static pool_elem_t
**get_elem_list(const pool_conf_t
*, int, uint_t
*);
123 static int share_resources(pool_conf_t
*);
124 static int resource_allocate(const char *, pool_resource_t
**,
126 static int resource_allocate_default(pool_resource_t
**, uint_t
);
127 static int pset_allocate_imp(pool_resource_t
**, uint_t
);
128 static int resource_compare_by_descending_importance(const void *,
130 static int compute_size_to_transfer(const void *, const void *);
131 static int set_importance_cb(pool_conf_t
*, pool_t
*, void *);
132 static int unset_importance_cb(pool_conf_t
*, pool_t
*, void *);
133 static int add_importance_props(pool_conf_t
*);
134 static int remove_importance_props(pool_conf_t
*);
135 static int clone_element(pool_conf_t
*, pool_elem_t
*,
136 const char *, pool_value_t
*, void *);
137 static int clean_element(pool_conf_t
*, pool_elem_t
*,
138 const char *, pool_value_t
*, void *);
141 * commit_create() is used to create a configuration element upon the
142 * system. Since only pools and resources actually need to perform any
143 * action, other elements are ignored as a no-op.
146 commit_create(pool_conf_t
*conf
, pool_elem_t
**e1
)
148 pool_resource_t
*res
;
150 const char *res_type
;
151 pool_elem_t
*src
= *e1
;
152 uint64_t smin
, smax
, dmax
;
153 pool_value_t val
= POOL_VALUE_INITIALIZER
;
156 switch (pool_elem_class(src
)) {
157 case PEC_SYSTEM
: /* NO-OP */
160 name
= elem_get_name(src
);
161 if ((pool
= pool_create(conf
, name
)) == NULL
) {
167 * Now copy the properties from the original pool to the
170 if (pool_walk_properties(TO_CONF(src
), src
, TO_ELEM(pool
),
171 clone_element
) != PO_SUCCESS
)
174 * Add a pointer to the src element which can be
175 * updated with a sys_id when the sys_id is allocated
176 * to the created element.
178 pool_set_pair(TO_ELEM(pool
), src
);
183 name
= elem_get_name(src
);
184 res_type
= pool_elem_class_string(src
);
185 if ((res
= pool_resource_create(conf
, res_type
, name
)) ==
192 * Need to do some ordering of property updates.
193 * Compare the values of source min/max and
194 * destination min/max. If smin < dmax then update the
195 * smin first, else update the max first.
197 if (resource_get_min(pool_elem_res(src
), &smin
) != PO_SUCCESS
||
198 resource_get_max(pool_elem_res(src
), &smax
) != PO_SUCCESS
||
199 resource_get_max(res
, &dmax
) != PO_SUCCESS
)
202 pool_value_set_uint64(&val
, smin
);
203 if (pool_put_ns_property(TO_ELEM(res
), c_min_prop
,
207 pool_value_set_uint64(&val
, smax
);
208 if (pool_put_ns_property(TO_ELEM(res
), c_max_prop
,
213 * Now copy the properties from the original resource
216 if (pool_walk_properties(TO_CONF(src
), src
, TO_ELEM(res
),
217 clone_element
) != PO_SUCCESS
)
220 * Add a pointer to the src element which can be
221 * updated with a sys_id when the sys_id is allocated
222 * to the created element.
224 pool_set_pair(TO_ELEM(res
), src
);
227 case PEC_COMP
: /* NO-OP */
237 * commit_delete() is used to delete a configuration element upon the
238 * system. Since only pools and resources actually need to perform
239 * any action, other elements are ignored as a no-op.
242 commit_delete(pool_elem_t
*pe
)
244 pool_resource_t
*res
;
251 switch (pool_elem_class(pe
)) {
252 case PEC_SYSTEM
: /* NO-OP */
255 pool
= pool_elem_pool(pe
);
256 ret
= pool_destroy(TO_CONF(pe
), pool
);
260 res
= pool_elem_res(pe
);
261 ret
= pool_resource_destroy(TO_CONF(pe
), res
);
263 case PEC_COMP
: /* NO-OP */
272 * commit_update() is used to update a configuration element upon the
273 * system or in a static configuration file. The pass parameter
274 * governs whether properties are being updated or associations. In
275 * pass 0, properties are updated. If the element is of class
276 * PEC_COMP, then make sure that the element in the static
277 * configuration file is correctly located before proceeding with the
278 * update. Then, the element in the dynamic configuration file is
279 * updated. In pass 1, i.e. pass != 0, any pool components have their
280 * associations updated in the dynamic configuration.
283 commit_update(pool_elem_t
*e1
, pool_elem_t
*e2
, int pass
)
286 pool_resource_t
*res1
;
287 pool_resource_t
*res2
;
288 if (pool_elem_class(e1
) == PEC_COMP
) {
289 res1
= pool_get_owning_resource(TO_CONF(e1
),
291 res2
= pool_get_owning_resource(TO_CONF(e2
),
293 if (pool_elem_compare_name(TO_ELEM(res1
),
294 TO_ELEM(res2
)) != 0) {
296 const pool_resource_t
*newres
;
297 pool_component_t
*comps
[2] = { NULL
};
299 comps
[0] = pool_elem_comp(e2
);
300 name
= elem_get_name(TO_ELEM(res1
));
301 newres
= pool_get_resource(TO_CONF(e2
),
302 pool_elem_class_string(TO_ELEM(res1
)),
307 dprintf("transferring: res, comp\n");
308 pool_elem_dprintf(TO_ELEM(newres
));
309 pool_elem_dprintf(e2
);
311 (void) pool_resource_xtransfer(TO_CONF(e2
),
312 res2
, (pool_resource_t
*)newres
, comps
);
315 if (pool_walk_properties(TO_CONF(e2
), e2
, NULL
,
316 clean_element
) != PO_SUCCESS
) {
320 * Need to do some ordering of property updates if the
321 * element to be updated is a resource. Compare the
322 * values of source min/max and destination
323 * min/max. If smin < dmax then update the smin first,
324 * else update the max first.
326 if (pool_elem_class(e1
) == PEC_RES_COMP
||
327 pool_elem_class(e1
) == PEC_RES_AGG
) {
328 uint64_t smin
, smax
, dmax
;
329 pool_value_t val
= POOL_VALUE_INITIALIZER
;
331 if (resource_get_min(pool_elem_res(e1
), &smin
) !=
333 resource_get_max(pool_elem_res(e1
), &smax
) !=
335 resource_get_max(pool_elem_res(e2
), &dmax
) !=
339 pool_value_set_uint64(&val
, smin
);
340 if (pool_put_ns_property(e2
, c_min_prop
,
344 pool_value_set_uint64(&val
, smax
);
345 if (pool_put_ns_property(e2
, c_max_prop
,
351 * This next couple of steps needs some
352 * explanation. The first walk, copies all the
353 * properties that are writeable from the static
354 * configuration to the dynamic configuration. The
355 * second walk copies all properties (writeable or
356 * not) from the dynamic configuration element back to
357 * the static configuration element. This ensures that
358 * updates from the static configuration element are
359 * correctly applied to the dynamic configuration and
360 * then the static configuration element is updated
361 * with the latest values of the read-only xproperties
362 * from the dynamic configuration element. The
363 * enforcing of permissions is performed in
364 * clone_element by its choice of property
365 * manipulation function.
367 if (pool_walk_properties(TO_CONF(e1
), e1
, e2
, clone_element
) !=
371 if (pool_walk_properties(TO_CONF(e2
), e2
, e1
, clone_element
) !=
376 if (pool_elem_class(e1
) == PEC_POOL
) {
377 pool_resource_t
**rs
;
380 pool_value_t val
= POOL_VALUE_INITIALIZER
;
381 pool_value_t
*pvals
[] = { NULL
, NULL
};
384 if (pool_value_set_string(&val
, "pset") != PO_SUCCESS
||
385 pool_value_set_name(&val
, c_type
) != PO_SUCCESS
)
387 if ((rs
= pool_query_pool_resources(TO_CONF(e1
),
388 pool_elem_pool(e1
), &nelem
, pvals
)) != NULL
) {
389 for (i
= 0; i
< nelem
; i
++) {
390 const pool_resource_t
*tgt_res
;
392 elem_get_name(TO_ELEM(rs
[i
]));
394 if ((tgt_res
= pool_get_resource(
395 TO_CONF(e2
), pool_elem_class_string(
396 TO_ELEM(rs
[i
])), res_name
)) ==
398 tgt_res
= get_default_resource(
402 if (pool_associate(TO_CONF(e2
),
403 pool_elem_pool(e2
), tgt_res
) !=
417 * diff_and_fix() works out the differences between two configurations
418 * and modifies the state of the system to match the operations
419 * required to bring the two configurations into sync.
421 * Returns PO_SUCCESS/PO_FAIL.
424 diff_and_fix(pool_conf_t
*stc
, pool_conf_t
*dyn
)
427 * The ordering of the operations is significant, we must
428 * process the system element, then the pools elements, then
429 * the resource elements, then the pools elements again and
430 * finally the resource components.
433 * PEC_RES_COMP are the only type of resources
434 * currently. When PEC_RES_AGG resources are added they must
437 if (process_lists(PEC_SYSTEM
, stc
, dyn
, 0) != PO_SUCCESS
) {
440 if (process_lists(PEC_POOL
, stc
, dyn
, 0) != PO_SUCCESS
) {
443 if (process_lists(PEC_RES_COMP
, stc
, dyn
, 0) != PO_SUCCESS
) {
446 if (process_lists(PEC_COMP
, stc
, dyn
, 0) != PO_SUCCESS
) {
449 if (process_lists(PEC_POOL
, stc
, dyn
, 1) != PO_SUCCESS
) {
453 * Share the resources. It has to be called for both
454 * configurations to ensure that the configurations still look
457 if (share_resources(dyn
) != PO_SUCCESS
) {
460 if (share_resources(stc
) != PO_SUCCESS
) {
467 process_elem_lt(pool_elem_t
*pe
, pool_conf_t
*dyn
)
469 if (pool_elem_class(pe
) == PEC_COMP
) {
470 if (pool_component_destroy(pool_elem_comp(pe
)) == PO_FAIL
) {
473 } else if (! elem_is_default(pe
)) {
474 if (commit_create(dyn
, &pe
) != PO_SUCCESS
) {
482 process_elem_gt(pool_elem_t
*pe
, pool_conf_t
*stc
, pool_conf_t
*dyn
)
484 if (pool_elem_class(pe
) == PEC_COMP
) {
485 pool_resource_t
*owner
;
486 const pool_resource_t
*parent_res
;
487 pool_value_t val
= POOL_VALUE_INITIALIZER
;
488 const pool_component_t
*newcomp
;
492 * I have to find the right parent in the static
493 * configuration. It may not exist, in which case it's
494 * correct to put it in the default
496 owner
= pool_get_owning_resource(dyn
,
498 if (pool_get_ns_property(TO_ELEM(owner
), "name", &val
) ==
502 if (pool_value_get_string(&val
, &resname
) == PO_FAIL
)
505 if ((resname
= strdup(resname
)) == NULL
)
508 restype
= pool_elem_class_string(TO_ELEM(owner
));
509 parent_res
= pool_get_resource(stc
, restype
, resname
);
510 free((void *)resname
);
511 if (parent_res
== NULL
)
512 parent_res
= resource_by_sysid(stc
, PS_NONE
, restype
);
514 * Now need to make a copy of the component in the
515 * dynamic configuration in the static configuration.
517 if ((newcomp
= pool_component_create(stc
, parent_res
,
518 elem_get_sysid(pe
))) == NULL
)
521 if (pool_walk_properties(TO_CONF(pe
), pe
, TO_ELEM(newcomp
),
522 clone_element
) != PO_SUCCESS
)
524 } else if (elem_is_default(pe
)) {
525 pool_resource_t
*newres
;
529 if ((name
= elem_get_name(pe
)) == NULL
)
531 switch (pool_elem_class(pe
)) {
533 if ((newpool
= pool_create(stc
, name
)) == NULL
) {
538 if (pool_walk_properties(TO_CONF(pe
), pe
,
539 TO_ELEM(newpool
), clone_element
) != PO_SUCCESS
)
544 if ((newres
= pool_resource_create(stc
,
545 pool_elem_class_string(pe
), name
)) ==
551 if (pool_walk_properties(TO_CONF(pe
), pe
,
552 TO_ELEM(newres
), clone_element
) != PO_SUCCESS
)
560 if (commit_delete(pe
) != PO_SUCCESS
)
567 * This function compares the elements of the supplied type in the
568 * static and dynamic configurations supplied. The lists of elements
569 * are compared and used to create, delete and update elements in
570 * both the static and dynamic configurations. The pass parameter is
571 * used to indicate to commit_update() whether property updates or
572 * association updates should be performed.
575 process_lists(int type
, pool_conf_t
*stc
, pool_conf_t
*dyn
, int pass
)
577 uint_t stc_nelem
= 0, dyn_nelem
= 0;
578 pool_elem_t
**stc_elems
, **dyn_elems
;
580 int status
= PO_SUCCESS
;
582 if ((stc_elems
= get_elem_list(stc
, type
, &stc_nelem
)) == NULL
)
585 qsort(stc_elems
, stc_nelem
, sizeof (pool_elem_t
*),
588 if ((dyn_elems
= get_elem_list(dyn
, type
, &dyn_nelem
)) == NULL
) {
593 qsort(dyn_elems
, dyn_nelem
, sizeof (pool_elem_t
*),
596 * Step through and do the updating, remember that we are
597 * comparing using the compare function for the configuration
601 while (status
== PO_SUCCESS
&& i
< stc_nelem
&& j
< dyn_nelem
) {
604 * We are going to do this by stepping through the static
607 if (elem_is_default(stc_elems
[i
]) &&
608 elem_is_default(dyn_elems
[j
]))
611 compare
= pool_elem_compare_name(stc_elems
[i
],
614 status
= process_elem_lt(stc_elems
[i
], dyn
);
616 } else if (compare
> 0) {
617 status
= process_elem_gt(dyn_elems
[j
], stc
, dyn
);
619 } else { /* compare == 0 */
620 if (commit_update(stc_elems
[i
], dyn_elems
[j
], pass
)
628 if (status
== PO_FAIL
) {
633 while (status
== PO_SUCCESS
&& i
< stc_nelem
) {
634 status
= process_elem_lt(stc_elems
[i
], dyn
);
637 if (status
== PO_FAIL
) {
642 while (status
== PO_SUCCESS
&& j
< dyn_nelem
) {
643 status
= process_elem_gt(dyn_elems
[j
], stc
, dyn
);
652 * get_elem_list() returns a list of pool_elem_t's. The size of the
653 * list is written into nelem. The list contains elements of all types
654 * that pools is interested in: i.e. system, pool, resources and
655 * resource components. It is the caller's responsibility to free the
656 * list when it is finished with.
658 * The array of pointers returned by the type specific query can be
659 * safely cast to be an array of pool_elem_t pointers. In the case of
660 * PEC_RES_COMP some additional processing is required to qualify the
663 * Returns a pointer to a list of pool_elem_t's or NULL on failure.
665 static pool_elem_t
**
666 get_elem_list(const pool_conf_t
*conf
, int type
, uint_t
*nelem
)
668 pool_resource_t
**rl
;
670 pool_component_t
**cl
;
671 pool_elem_t
**elems
= NULL
;
676 if ((elems
= malloc(sizeof (pool_elem_t
*))) == NULL
)
679 elems
[0] = pool_conf_to_elem(conf
);
682 if ((pl
= pool_query_pools(conf
, nelem
, NULL
)) != NULL
) {
683 elems
= (pool_elem_t
**)pl
;
687 if ((rl
= pool_query_resources(conf
, nelem
, NULL
)) != NULL
) {
689 elems
= (pool_elem_t
**)rl
;
690 for (i
= 0; i
< *nelem
; i
++) {
691 if (pool_elem_class(TO_ELEM(rl
[i
])) ==
693 elems
[j
++] = TO_ELEM(rl
[i
]);
699 if ((cl
= pool_query_components(conf
, nelem
, NULL
)) != NULL
) {
700 elems
= (pool_elem_t
**)cl
;
711 * share_resources() sets up the allocation of resources by each
712 * provider. Firstly all resources are updated with the importance of
713 * each pool, then each resource provider is invoked in turn with a
714 * list of its own resources. Finally, the pool importance details
715 * are removed from the resources.
717 * Returns PO_SUCCESS/PO_FAIL
720 share_resources(pool_conf_t
*conf
)
722 pool_resource_t
**resources
;
724 pool_value_t
*props
[] = { NULL
, NULL
};
725 pool_value_t val
= POOL_VALUE_INITIALIZER
;
730 * Call an allocation function for each type of supported resource.
731 * This function is responsible for "sharing" resources to resource
732 * sets as determined by the system.allocate-method.
735 if (pool_value_set_string(props
[0], "pset") != PO_SUCCESS
||
736 pool_value_set_name(props
[0], c_type
) != PO_SUCCESS
)
739 if (add_importance_props(conf
) != PO_SUCCESS
) {
740 (void) remove_importance_props(conf
);
744 if ((resources
= pool_query_resources(conf
, &nelem
, props
)) != NULL
) {
746 * 'pool.importance' defines the importance of a pool;
747 * resources inherit the importance of the pool that
748 * is associated with them. If more than one pool is
749 * associated with a resource, the importance of the
750 * resource is the maximum importance of all
751 * associated pools. Use '_importance' on resources
752 * to determine who gets extra.
754 if (resource_allocate("pset", resources
, nelem
) != PO_SUCCESS
) {
756 (void) remove_importance_props(conf
);
761 (void) remove_importance_props(conf
);
767 * Work out which allocation method to use based on the value of the
768 * system.allocate-method property.
771 resource_allocate(const char *type
, pool_resource_t
**res
, uint_t nelem
)
774 const char *method_name
;
776 pool_value_t val
= POOL_VALUE_INITIALIZER
;
779 pe
= pool_conf_to_elem(TO_CONF(TO_ELEM(res
[0])));
781 if (pool_get_ns_property(pe
, "allocate-method", &val
) != POC_STRING
)
782 method_name
= POA_IMPORTANCE
;
784 (void) pool_value_get_string(&val
, &method_name
);
786 if (strcmp(POA_IMPORTANCE
, method_name
) != 0) {
787 if (strcmp(POA_SURPLUS_TO_DEFAULT
, method_name
) != 0) {
788 pool_seterror(POE_INVALID_CONF
);
791 method
= POA_SURPLUS_TO_DEFAULT_NUM
;
794 method
= POA_IMPORTANCE_NUM
;
797 case POA_IMPORTANCE_NUM
:
799 * TODO: Add support for new resource types
801 switch (pool_resource_elem_class_from_string(type
)) {
803 ret
= pset_allocate_imp(res
, nelem
);
810 case POA_SURPLUS_TO_DEFAULT_NUM
:
811 ret
= resource_allocate_default(res
, nelem
);
819 * Each set will get its minimum, however if there is more than the
820 * total minimum available, then leave this in the default set.
823 resource_allocate_default(pool_resource_t
**res
, uint_t nelem
)
825 res_info_t
*res_info
;
827 pool_resource_t
*default_res
= NULL
;
832 if ((res_info
= calloc(nelem
, sizeof (res_info_t
))) == NULL
) {
836 /* Load current resource values. */
837 for (j
= 0; j
< nelem
; j
++) {
839 if (default_res
== NULL
&&
840 resource_is_default(res
[j
]) == PO_TRUE
)
841 default_res
= res
[j
];
843 if (resource_get_max(res
[j
],
844 &res_info
[j
].ri_max
) == PO_FAIL
||
845 resource_get_min(res
[j
],
846 &res_info
[j
].ri_min
) == PO_FAIL
||
847 resource_get_size(res
[j
],
848 &res_info
[j
].ri_oldsize
) == PO_FAIL
||
849 resource_get_pinned(res
[j
],
850 &res_info
[j
].ri_pinned
) == PO_FAIL
) {
854 res_info
[j
].ri_res
= res
[j
];
858 * Firstly, for all resources that have size greater than min,
859 * transfer all movable size above min to the default resource.
861 for (j
= 0; j
< nelem
; j
++) {
865 /* compute the real minimum number of resources */
866 real_min
= MAX(res_info
[j
].ri_pinned
, res_info
[j
].ri_min
);
867 if (res_info
[j
].ri_res
!= default_res
&&
868 res_info
[j
].ri_oldsize
> real_min
) {
872 num
= res_info
[j
].ri_oldsize
- real_min
;
873 if (pool_resource_transfer(
874 TO_CONF(TO_ELEM(default_res
)),
875 res_info
[j
].ri_res
, default_res
, num
) !=
883 * Now, transfer resources below min from the default.
885 for (j
= 0; j
< nelem
; j
++) {
887 * We don't want to interfere with resources which are reserved
889 if (res_info
[j
].ri_res
!= default_res
&&
890 res_info
[j
].ri_oldsize
< res_info
[j
].ri_min
) {
891 if (pool_resource_transfer(
892 TO_CONF(TO_ELEM(default_res
)),
893 default_res
, res_info
[j
].ri_res
,
894 res_info
[j
].ri_min
- res_info
[j
].ri_oldsize
) !=
906 * Allocate cpus to pset resource sets, favoring sets with higher importance.
908 * Step 1: Sort resource sets by decreasing importance, and load each set's
909 * current size (oldsize), min, max, and number of pinned cpus.
910 * Compute the total number of cpus by totaling oldsize.
912 * Step 2: Compute the newsize for each set:
914 * Give each set its min number of cpus. This min may be greater than
915 * its pset.min due to pinned cpus. If there are more cpus than the total
916 * of all mins, then the surplus cpus are dealt round-robin to all sets
917 * (up to their max) in order of decreasing importance. A set may be
918 * skipped during dealing because it started with more than its min due to
919 * pinned cpus. The dealing stops when there are no more cpus or all
920 * sets are at their max. If all sets are at their max, any remaining cpus
921 * are given to the default set.
923 * Step 3: Transfer cpus from sets with (oldsize > newsize) to sets with
924 * (oldsize < newsize).
927 pset_allocate_imp(pool_resource_t
**res
, uint_t nelem
)
929 res_info_t
*res_info
;
930 res_info_t
*default_res_info
;
931 const pool_resource_t
*default_res
= NULL
;
932 uint64_t tot_resources
= 0; /* total count of resources */
933 uint64_t tot_min
= 0; /* total of all resource set mins */
934 uint64_t num_to_deal
= 0; /* total resources above mins to deal */
935 uint64_t sets_maxed
= 0; /* number of resource sets dealt to */
937 uint64_t sets_finished
= 0; /* number of resource sets that have */
938 /* size == newsize */
942 int ret
= PO_SUCCESS
;
945 * Build list of res_info_t's
947 if ((res_info
= calloc(nelem
, sizeof (res_info_t
))) == NULL
) {
948 pool_seterror(POE_SYSTEM
);
952 /* Order resources by importance, most important being first */
953 qsort(res
, nelem
, sizeof (pool_resource_t
*),
954 resource_compare_by_descending_importance
);
956 for (j
= 0; j
< nelem
; j
++) {
958 /* Track which resource is the default */
959 if (default_res
== NULL
&&
960 resource_is_default(res
[j
]) == PO_TRUE
) {
961 default_res
= res
[j
];
962 default_res_info
= &(res_info
[j
]);
965 /* Load sets' current values */
966 if (resource_get_max(res
[j
], &res_info
[j
].ri_max
) == PO_FAIL
||
967 resource_get_min(res
[j
], &res_info
[j
].ri_min
) == PO_FAIL
||
968 resource_get_size(res
[j
], &res_info
[j
].ri_oldsize
) ==
970 resource_get_pinned(res
[j
],
971 &res_info
[j
].ri_pinned
) == PO_FAIL
) {
976 /* Start each set's newsize out at their min. */
977 res_info
[j
].ri_newsize
= res_info
[j
].ri_min
;
979 /* pre-deal pinned resources that exceed min */
980 if (res_info
[j
].ri_pinned
> res_info
[j
].ri_min
) {
981 res_info
[j
].ri_newsize
= res_info
[j
].ri_pinned
;
982 res_info
[j
].ri_dealt
=
983 res_info
[j
].ri_newsize
- res_info
[j
].ri_min
;
985 res_info
[j
].ri_res
= res
[j
];
987 /* Compute total number of resources to deal out */
988 tot_resources
+= res_info
[j
].ri_oldsize
;
989 tot_min
+= res_info
[j
].ri_newsize
;
992 dprintf("res allocation details\n");
993 pool_elem_dprintf(TO_ELEM(res
[j
]));
994 dprintf("size=%llu\n", res_info
[j
].ri_oldsize
);
998 num_to_deal
= tot_resources
- tot_min
;
1001 * Deal one resource to each set, and then another, until all
1002 * resources are dealt or all sets are at their max.
1004 for (deal
= 1; num_to_deal
> 0 && sets_maxed
< nelem
; deal
++) {
1005 for (j
= 0; j
< nelem
; j
++) {
1008 * Skip this resource set if it has already been
1009 * pre-dealt a resource due to pinned resources.
1011 if (res_info
[j
].ri_dealt
>= deal
)
1014 if (res_info
[j
].ri_newsize
< res_info
[j
].ri_max
) {
1016 res_info
[j
].ri_dealt
++;
1017 res_info
[j
].ri_newsize
++;
1018 if (res_info
[j
].ri_newsize
==
1023 if (num_to_deal
== 0)
1030 * If all resource sets are at their max, deal the remaining to the
1031 * default resource set.
1033 if ((sets_maxed
== nelem
) && (num_to_deal
> 0)) {
1034 default_res_info
->ri_dealt
+= num_to_deal
;
1035 default_res_info
->ri_newsize
+= num_to_deal
;
1039 * Sort so that resource sets needing resources precede resource sets
1040 * that have extra resources. The sort function will also compute
1041 * the quantity of resources that need to be transferred into or out
1042 * of each set so that its size == newsize.
1044 qsort(res_info
, nelem
, sizeof (res_info_t
),
1045 compute_size_to_transfer
);
1048 * The donor index starts at the end of the resource set list and
1049 * walks up. The receiver index starts at the beginning of the
1050 * resource set list and walks down. CPUs are transferred from the
1051 * donors to the receivers until all sets have transfer == 0.
1056 /* Number of sets with transfer == 0 */
1059 /* Transfer resources so that each set's size becomes newsize */
1063 if (donor
== receiver
) {
1064 if (res_info
[donor
].ri_transfer
!= 0) {
1071 if (res_info
[donor
].ri_transfer
== 0) {
1076 if (res_info
[receiver
].ri_transfer
== 0) {
1082 /* Transfer resources from the donor set to the receiver */
1083 ntrans
= MIN(res_info
[donor
].ri_transfer
,
1084 -res_info
[receiver
].ri_transfer
);
1086 if (pool_resource_transfer(
1087 TO_CONF(TO_ELEM(res_info
[donor
].ri_res
)),
1088 res_info
[donor
].ri_res
, res_info
[receiver
].ri_res
,
1089 ntrans
) != PO_SUCCESS
) {
1093 res_info
[donor
].ri_transfer
-= ntrans
;
1094 res_info
[receiver
].ri_transfer
+= ntrans
;
1097 if (sets_finished
!= nelem
)
1105 * Used as a qsort parameter to help order resources in terms of their
1106 * importance, higher importance being first.
1109 resource_compare_by_descending_importance(const void *arg1
, const void *arg2
)
1113 pool_resource_t
**res1
= (pool_resource_t
**)arg1
;
1114 pool_resource_t
**res2
= (pool_resource_t
**)arg2
;
1115 pool_value_t val
= POOL_VALUE_INITIALIZER
;
1116 int64_t i1
= 0, i2
= 0;
1118 elem1
= TO_ELEM(*res1
);
1119 elem2
= TO_ELEM(*res2
);
1121 if (pool_get_property(TO_CONF(elem1
), elem1
, "_importance", &val
) ==
1123 (void) pool_value_get_int64(&val
, &i1
);
1125 if (pool_get_property(TO_CONF(elem2
), elem2
, "_importance", &val
) ==
1127 (void) pool_value_get_int64(&val
, &i2
);
1128 return (i1
> i2
? -1 : (i1
< i2
? 1 : 0));
1132 * Sort in increasing order so that resource sets with extra resources are at
1133 * the end and resource sets needing resources are at the beginning.
1136 compute_size_to_transfer(const void *arg1
, const void *arg2
)
1138 res_info_t
*r1
= (res_info_t
*)arg1
, *r2
= (res_info_t
*)arg2
;
1139 r1
->ri_transfer
= (int64_t)r1
->ri_oldsize
- (int64_t)r1
->ri_newsize
;
1140 r2
->ri_transfer
= (int64_t)r2
->ri_oldsize
- (int64_t)r2
->ri_newsize
;
1141 return (r1
->ri_transfer
> r2
->ri_transfer
? 1 :
1142 (r1
->ri_transfer
< r2
->ri_transfer
? -1 : 0));
1146 * set_importance_cb() is used to create "_importance" props on each
1147 * resource associated with a pool.
1149 * Returns PO_SUCCESS/PO_FAIL
1153 set_importance_cb(pool_conf_t
*conf
, pool_t
*pool
, void *unused
)
1155 pool_value_t val
= POOL_VALUE_INITIALIZER
;
1157 pool_resource_t
**res
;
1160 if (pool_get_property(conf
, TO_ELEM(pool
), "pool.importance", &val
) !=
1162 pool_seterror(POE_INVALID_CONF
);
1165 (void) pool_value_get_int64(&val
, &importance
);
1166 if ((res
= pool_query_pool_resources(conf
, pool
, &nelem
, NULL
)) ==
1170 for (i
= 0; res
[i
] != NULL
; i
++) {
1171 int64_t old_importance
= INT64_MIN
;
1172 pool_elem_t
*elem
= TO_ELEM(res
[i
]);
1174 if (pool_get_property(conf
, elem
, "_importance", &val
) ==
1176 (void) pool_value_get_int64(&val
, &old_importance
);
1177 if (old_importance
<= importance
) {
1178 (void) pool_value_set_int64(&val
, importance
);
1179 (void) pool_put_property(conf
, elem
, "_importance",
1184 return (PO_SUCCESS
);
1188 * unset_importance_cb() is used to remove "_importance" props from
1189 * each resource associated with a pool.
1191 * Returns PO_SUCCESS/PO_FAIL
1195 unset_importance_cb(pool_conf_t
*conf
, pool_t
*pool
, void *unused
)
1197 pool_resource_t
**res
;
1200 if ((res
= pool_query_pool_resources(conf
, pool
, &nelem
, NULL
)) ==
1204 for (i
= 0; res
[i
] != NULL
; i
++) {
1205 if (pool_rm_property(conf
, TO_ELEM(res
[i
]), "_importance") ==
1212 return (PO_SUCCESS
);
1216 * add_importance_props() is used to create "_importance" props on
1217 * each resource associated with a pool.
1219 * Returns PO_SUCCESS/PO_FAIL
1222 add_importance_props(pool_conf_t
*conf
)
1224 return (pool_walk_pools(conf
, NULL
, set_importance_cb
));
1228 * remove_importance_props() is used to remove "_importance" props on
1229 * each resource associated with a pool.
1231 * Returns PO_SUCCESS/PO_FAIL
1234 remove_importance_props(pool_conf_t
*conf
)
1236 return (pool_walk_pools(conf
, NULL
, unset_importance_cb
));
1240 * pool_conf_commit_sys() takes a configuration and modifies both the
1241 * supplied configuration and the dynamic configuration. The goal of
1242 * this modification is to generate a dynamic configuration which best
1243 * represents the constraints laid down in the static configuration
1244 * and to update the static configuration with the results of this
1247 * Returns PO_SUCCESS/PO_FAIL
1250 pool_conf_commit_sys(pool_conf_t
*conf
, int validate
)
1254 if ((dyn
= pool_conf_alloc()) == NULL
)
1256 if (pool_conf_open(dyn
, pool_dynamic_location(), PO_RDWR
) !=
1258 pool_conf_free(dyn
);
1261 if (validate
== PO_TRUE
) {
1262 if (pool_conf_validate(conf
, POV_RUNTIME
) != PO_SUCCESS
) {
1263 (void) pool_conf_close(dyn
);
1264 pool_conf_free(dyn
);
1269 * Now try to make the two things "the same".
1271 if (diff_and_fix(conf
, dyn
) != PO_SUCCESS
) {
1272 (void) pool_conf_close(dyn
);
1273 pool_conf_free(dyn
);
1274 pool_seterror(POE_INVALID_CONF
);
1277 if (dyn
->pc_prov
->pc_commit(dyn
) != PO_SUCCESS
) {
1278 (void) pool_conf_close(dyn
);
1279 pool_conf_free(dyn
);
1282 (void) pool_conf_close(dyn
);
1283 pool_conf_free(dyn
);
1284 return (PO_SUCCESS
);
1288 * Copies all properties from one element to another. If the property
1289 * is a readonly property, then don't copy it.
1293 clone_element(pool_conf_t
*conf
, pool_elem_t
*pe
, const char *name
,
1294 pool_value_t
*pv
, void *user
)
1296 pool_elem_t
*tgt
= (pool_elem_t
*)user
;
1297 const pool_prop_t
*prop
;
1299 dprintf("Cloning %s from %s\n",
1300 pool_conf_location(TO_CONF(TO_ELEM(tgt
))),
1301 pool_conf_location(TO_CONF(pe
)));
1302 assert(TO_CONF(TO_ELEM(tgt
)) != TO_CONF(pe
));
1303 dprintf("clone_element: Processing %s\n", name
);
1304 pool_value_dprintf(pv
);
1307 * Some properties should be ignored
1309 if ((prop
= provider_get_prop(pe
, name
)) != NULL
&&
1310 prop_is_readonly(prop
) == PO_TRUE
)
1311 return (PO_SUCCESS
);
1313 /* The temporary property needs special handling */
1314 if (strstr(name
, ".temporary") != NULL
)
1315 return (pool_set_temporary(TO_CONF(tgt
), tgt
) ==
1316 PO_FAIL
? PO_FAIL
: PO_SUCCESS
);
1318 return (pool_put_property(TO_CONF(tgt
), tgt
, name
, pv
) ==
1319 PO_FAIL
? PO_FAIL
: PO_SUCCESS
);
1323 * Removes all properties from one element. Properties which are
1324 * managed by the configuration are ignored.
1328 clean_element(pool_conf_t
*conf
, pool_elem_t
*pe
, const char *name
,
1329 pool_value_t
*pv
, void *user
)
1331 const pool_prop_t
*prop
;
1333 * Some properties should be ignored
1335 if (strstr(name
, ".temporary") != NULL
||
1336 ((prop
= provider_get_prop(pe
, name
)) != NULL
&&
1337 prop_is_optional(prop
) == PO_FALSE
))
1338 return (PO_SUCCESS
);
1339 return (pool_rm_property(conf
, (pool_elem_t
*)pe
, name
) == PO_FAIL
);