4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2012 Milan Jurik. All rights reserved.
26 * fme.c -- fault management exercise module
28 * this module provides the simulated fault management exercise.
37 #include <libnvpair.h>
38 #include <sys/fm/protocol.h>
39 #include <fm/fmd_api.h>
57 /* imported from eft.c... */
58 extern hrtime_t Hesitate
;
59 extern char *Serd_Override
;
60 extern nv_alloc_t Eft_nv_hdl
;
62 extern fmd_hdl_t
*Hdl
;
64 static int Istat_need_save
;
65 static int Serd_need_save
;
66 void istat_save(void);
69 /* fme under construction is global so we can free it on module abort */
70 static struct fme
*Nfmep
;
72 static int Undiag_reason
= UD_VAL_UNKNOWN
;
74 static int Nextid
= 0;
76 static int Open_fme_count
= 0; /* Count of open FMEs */
78 /* list of fault management exercises underway */
80 struct fme
*next
; /* next exercise */
81 unsigned long long ull
; /* time when fme was created */
83 struct config
*config
; /* cooked configuration data */
84 struct lut
*eventtree
; /* propagation tree for this FME */
86 * The initial error report that created this FME is kept in
87 * two forms. e0 points to the instance tree node and is used
88 * by fme_eval() as the starting point for the inference
89 * algorithm. e0r is the event handle FMD passed to us when
90 * the ereport first arrived and is used when setting timers,
91 * which are always relative to the time of this initial
97 id_t timer
; /* for setting an fmd time-out */
99 struct event
*ecurrent
; /* ereport under consideration */
100 struct event
*suspects
; /* current suspect list */
101 struct event
*psuspects
; /* previous suspect list */
102 int nsuspects
; /* count of suspects */
103 int posted_suspects
; /* true if we've posted a diagnosis */
104 int uniqobs
; /* number of unique events observed */
105 int peek
; /* just peeking, don't track suspects */
106 int overflow
; /* true if overflow FME */
108 FME_NOTHING
= 5000, /* not evaluated yet */
109 FME_WAIT
, /* need to wait for more info */
110 FME_CREDIBLE
, /* suspect list is credible */
111 FME_DISPROVED
, /* no valid suspects found */
112 FME_DEFERRED
/* don't know yet (k-count not met) */
115 unsigned long long pull
; /* time passed since created */
116 unsigned long long wull
; /* wait until this time for re-eval */
117 struct event
*observations
; /* observation list */
118 struct lut
*globals
; /* values of global variables */
119 /* fmd interfacing */
120 fmd_hdl_t
*hdl
; /* handle for talking with fmd */
121 fmd_case_t
*fmcase
; /* what fmd 'case' we associate with */
123 struct stats
*Rcount
;
124 struct stats
*Hcallcount
;
125 struct stats
*Rcallcount
;
126 struct stats
*Ccallcount
;
127 struct stats
*Ecallcount
;
128 struct stats
*Tcallcount
;
129 struct stats
*Marrowcount
;
131 } *FMElist
, *EFMElist
, *ClosedFMEs
;
133 static struct case_list
{
135 struct case_list
*next
;
136 } *Undiagablecaselist
;
138 static void fme_eval(struct fme
*fmep
, fmd_event_t
*ffep
);
139 static enum fme_state
hypothesise(struct fme
*fmep
, struct event
*ep
,
140 unsigned long long at_latest_by
, unsigned long long *pdelay
);
141 static struct node
*eventprop_lookup(struct event
*ep
, const char *propname
);
142 static struct node
*pathstring2epnamenp(char *path
);
143 static void publish_undiagnosable(fmd_hdl_t
*hdl
, fmd_event_t
*ffep
,
144 fmd_case_t
*fmcase
, nvlist_t
*detector
, char *arg
);
145 static char *undiag_2reason_str(int ud
, char *arg
);
146 static const char *undiag_2defect_str(int ud
);
147 static void restore_suspects(struct fme
*fmep
);
148 static void save_suspects(struct fme
*fmep
);
149 static void destroy_fme(struct fme
*f
);
150 static void fme_receive_report(fmd_hdl_t
*hdl
, fmd_event_t
*ffep
,
151 const char *eventstring
, const struct ipath
*ipp
, nvlist_t
*nvl
);
152 static void istat_counter_reset_cb(struct istat_entry
*entp
,
153 struct stats
*statp
, const struct ipath
*ipp
);
154 static void istat_counter_topo_chg_cb(struct istat_entry
*entp
,
155 struct stats
*statp
, void *unused
);
156 static void serd_reset_cb(struct serd_entry
*entp
, void *unused
,
157 const struct ipath
*ipp
);
158 static void serd_topo_chg_cb(struct serd_entry
*entp
, void *unused
,
160 static void destroy_fme_bufs(struct fme
*fp
);
167 fmep
= MALLOC(sizeof (*fmep
));
168 bzero(fmep
, sizeof (*fmep
));
173 * fme_ready -- called when all initialization of the FME (except for
174 * stats) has completed successfully. Adds the fme to global lists
175 * and establishes its stats.
178 fme_ready(struct fme
*fmep
)
182 Nfmep
= NULL
; /* don't need to free this on module abort now */
185 EFMElist
->next
= fmep
;
188 FMElist
= EFMElist
= fmep
;
190 (void) sprintf(nbuf
, "fme%d.Rcount", fmep
->id
);
191 fmep
->Rcount
= stats_new_counter(nbuf
, "ereports received", 0);
192 (void) sprintf(nbuf
, "fme%d.Hcall", fmep
->id
);
193 fmep
->Hcallcount
= stats_new_counter(nbuf
, "calls to hypothesise()", 1);
194 (void) sprintf(nbuf
, "fme%d.Rcall", fmep
->id
);
195 fmep
->Rcallcount
= stats_new_counter(nbuf
,
196 "calls to requirements_test()", 1);
197 (void) sprintf(nbuf
, "fme%d.Ccall", fmep
->id
);
198 fmep
->Ccallcount
= stats_new_counter(nbuf
, "calls to causes_test()", 1);
199 (void) sprintf(nbuf
, "fme%d.Ecall", fmep
->id
);
201 stats_new_counter(nbuf
, "calls to effects_test()", 1);
202 (void) sprintf(nbuf
, "fme%d.Tcall", fmep
->id
);
203 fmep
->Tcallcount
= stats_new_counter(nbuf
, "calls to triggered()", 1);
204 (void) sprintf(nbuf
, "fme%d.Marrow", fmep
->id
);
205 fmep
->Marrowcount
= stats_new_counter(nbuf
,
206 "arrows marked by mark_arrows()", 1);
207 (void) sprintf(nbuf
, "fme%d.diags", fmep
->id
);
208 fmep
->diags
= stats_new_counter(nbuf
, "suspect lists diagnosed", 0);
210 out(O_ALTFP
|O_VERB2
, "newfme: config snapshot contains...");
211 config_print(O_ALTFP
|O_VERB2
, fmep
->config
);
216 extern void ipath_dummy_lut(struct arrow
*);
217 extern struct lut
*itree_create_dummy(const char *, const struct ipath
*);
221 set_needed_arrows(struct event
*ep
, struct event
*ep2
, struct fme
*fmep
)
224 struct arrowlist
*ap
;
226 for (bp
= itree_next_bubble(ep
, NULL
); bp
;
227 bp
= itree_next_bubble(ep
, bp
)) {
230 for (ap
= itree_next_arrow(bp
, NULL
); ap
;
231 ap
= itree_next_arrow(bp
, ap
)) {
232 ap
->arrowp
->pnode
->u
.arrow
.needed
= 1;
233 ipath_dummy_lut(ap
->arrowp
);
240 unset_needed_arrows(struct event
*ep
, struct event
*ep2
, struct fme
*fmep
)
243 struct arrowlist
*ap
;
245 for (bp
= itree_next_bubble(ep
, NULL
); bp
;
246 bp
= itree_next_bubble(ep
, bp
)) {
249 for (ap
= itree_next_arrow(bp
, NULL
); ap
;
250 ap
= itree_next_arrow(bp
, ap
))
251 ap
->arrowp
->pnode
->u
.arrow
.needed
= 0;
255 static void globals_destructor(void *left
, void *right
, void *arg
);
256 static void clear_arrows(struct event
*ep
, struct event
*ep2
, struct fme
*fmep
);
259 prune_propagations(const char *e0class
, const struct ipath
*e0ipp
)
262 unsigned long long my_delay
= TIMEVAL_EVENTUALLY
;
263 extern struct lut
*Usednames
;
267 Nfmep
->state
= FME_NOTHING
;
268 Nfmep
->eventtree
= itree_create_dummy(e0class
, e0ipp
);
270 itree_lookup(Nfmep
->eventtree
, e0class
, e0ipp
)) == NULL
) {
271 itree_free(Nfmep
->eventtree
);
276 Nfmep
->ecurrent
= Nfmep
->observations
= Nfmep
->e0
;
279 (void) sprintf(nbuf
, "fme%d.Rcount", Nfmep
->id
);
280 Nfmep
->Rcount
= stats_new_counter(nbuf
, "ereports received", 0);
281 (void) sprintf(nbuf
, "fme%d.Hcall", Nfmep
->id
);
283 stats_new_counter(nbuf
, "calls to hypothesise()", 1);
284 (void) sprintf(nbuf
, "fme%d.Rcall", Nfmep
->id
);
285 Nfmep
->Rcallcount
= stats_new_counter(nbuf
,
286 "calls to requirements_test()", 1);
287 (void) sprintf(nbuf
, "fme%d.Ccall", Nfmep
->id
);
289 stats_new_counter(nbuf
, "calls to causes_test()", 1);
290 (void) sprintf(nbuf
, "fme%d.Ecall", Nfmep
->id
);
292 stats_new_counter(nbuf
, "calls to effects_test()", 1);
293 (void) sprintf(nbuf
, "fme%d.Tcall", Nfmep
->id
);
294 Nfmep
->Tcallcount
= stats_new_counter(nbuf
, "calls to triggered()", 1);
295 (void) sprintf(nbuf
, "fme%d.Marrow", Nfmep
->id
);
296 Nfmep
->Marrowcount
= stats_new_counter(nbuf
,
297 "arrows marked by mark_arrows()", 1);
298 (void) sprintf(nbuf
, "fme%d.diags", Nfmep
->id
);
299 Nfmep
->diags
= stats_new_counter(nbuf
, "suspect lists diagnosed", 0);
302 lut_walk(Nfmep
->eventtree
, (lut_cb
)unset_needed_arrows
, (void *)Nfmep
);
303 lut_free(Usednames
, NULL
, NULL
);
305 lut_walk(Nfmep
->eventtree
, (lut_cb
)clear_arrows
, (void *)Nfmep
);
306 (void) hypothesise(Nfmep
, Nfmep
->e0
, Nfmep
->ull
, &my_delay
);
307 itree_prune(Nfmep
->eventtree
);
308 lut_walk(Nfmep
->eventtree
, (lut_cb
)set_needed_arrows
, (void *)Nfmep
);
310 stats_delete(Nfmep
->Rcount
);
311 stats_delete(Nfmep
->Hcallcount
);
312 stats_delete(Nfmep
->Rcallcount
);
313 stats_delete(Nfmep
->Ccallcount
);
314 stats_delete(Nfmep
->Ecallcount
);
315 stats_delete(Nfmep
->Tcallcount
);
316 stats_delete(Nfmep
->Marrowcount
);
317 stats_delete(Nfmep
->diags
);
318 itree_free(Nfmep
->eventtree
);
319 lut_free(Nfmep
->globals
, globals_destructor
, NULL
);
325 newfme(const char *e0class
, const struct ipath
*e0ipp
, fmd_hdl_t
*hdl
,
326 fmd_case_t
*fmcase
, fmd_event_t
*ffep
, nvlist_t
*nvl
)
328 struct cfgdata
*cfgdata
;
330 extern int alloc_total();
331 nvlist_t
*detector
= NULL
;
336 * First check if e0ipp is actually in the topology so we can give a
337 * more useful error message.
339 ipathlastcomp(e0ipp
);
340 pathstr
= ipath2str(NULL
, e0ipp
);
341 cfgdata
= config_snapshot();
342 platform_units_translate(0, cfgdata
->cooked
, NULL
, NULL
,
345 structconfig_free(cfgdata
->cooked
);
346 config_free(cfgdata
);
347 if (detector
== NULL
) {
348 /* See if class permits silent discard on unknown component. */
349 if (lut_lookup(Ereportenames_discard
, (void *)e0class
, NULL
)) {
350 out(O_ALTFP
|O_VERB2
, "Unable to map \"%s\" ereport "
351 "to component path, but silent discard allowed.",
354 Undiag_reason
= UD_VAL_BADEVENTPATH
;
355 (void) nvlist_lookup_nvlist(nvl
, FM_EREPORT_DETECTOR
,
357 arg
= ipath2str(e0class
, e0ipp
);
358 publish_undiagnosable(hdl
, ffep
, fmcase
, detector
, arg
);
365 * Next run a quick first pass of the rules with a dummy config. This
366 * allows us to prune those rules which can't possibly cause this
369 if (!prune_propagations(e0class
, e0ipp
)) {
371 * The fault class must have been in the rules or we would
372 * not have registered for it (and got a "nosub"), and the
373 * pathname must be in the topology or we would have failed the
374 * previous test. So to get here means the combination of
375 * class and pathname in the ereport must be invalid.
377 Undiag_reason
= UD_VAL_BADEVENTCLASS
;
378 arg
= ipath2str(e0class
, e0ipp
);
379 publish_undiagnosable(hdl
, ffep
, fmcase
, detector
, arg
);
380 nvlist_free(detector
);
386 * Now go ahead and create the real fme using the pruned rules.
388 init_size
= alloc_total();
389 out(O_ALTFP
|O_STAMP
, "start config_snapshot using %d bytes", init_size
);
390 nvlist_free(detector
);
391 pathstr
= ipath2str(NULL
, e0ipp
);
392 cfgdata
= config_snapshot();
393 platform_units_translate(0, cfgdata
->cooked
, NULL
, NULL
,
396 platform_save_config(hdl
, fmcase
);
397 out(O_ALTFP
|O_STAMP
, "config_snapshot added %d bytes",
398 alloc_total() - init_size
);
402 Nfmep
->id
= Nextid
++;
403 Nfmep
->config
= cfgdata
->cooked
;
404 config_free(cfgdata
);
405 Nfmep
->posted_suspects
= 0;
407 Nfmep
->state
= FME_NOTHING
;
411 Nfmep
->fmcase
= fmcase
;
414 if ((Nfmep
->eventtree
= itree_create(Nfmep
->config
)) == NULL
) {
415 Undiag_reason
= UD_VAL_INSTFAIL
;
416 arg
= ipath2str(e0class
, e0ipp
);
417 publish_undiagnosable(hdl
, ffep
, fmcase
, detector
, arg
);
418 nvlist_free(detector
);
420 structconfig_free(Nfmep
->config
);
421 destroy_fme_bufs(Nfmep
);
427 itree_ptree(O_ALTFP
|O_VERB2
, Nfmep
->eventtree
);
430 itree_lookup(Nfmep
->eventtree
, e0class
, e0ipp
)) == NULL
) {
431 Undiag_reason
= UD_VAL_BADEVENTI
;
432 arg
= ipath2str(e0class
, e0ipp
);
433 publish_undiagnosable(hdl
, ffep
, fmcase
, detector
, arg
);
434 nvlist_free(detector
);
436 itree_free(Nfmep
->eventtree
);
437 structconfig_free(Nfmep
->config
);
438 destroy_fme_bufs(Nfmep
);
444 nvlist_free(detector
);
445 return (fme_ready(Nfmep
));
451 struct fme
*sfp
, *fp
;
452 struct case_list
*ucasep
, *nextcasep
;
454 ucasep
= Undiagablecaselist
;
455 while (ucasep
!= NULL
) {
456 nextcasep
= ucasep
->next
;
460 Undiagablecaselist
= NULL
;
462 /* clean up closed fmes */
477 FMElist
= EFMElist
= NULL
;
479 /* if we were in the middle of creating an fme, free it now */
487 * Allocated space for a buffer name. 20 bytes allows for
488 * a ridiculous 9,999,999 unique observations.
493 * serialize_observation
495 * Create a recoverable version of the current observation
496 * (fp->ecurrent). We keep a serialized version of each unique
497 * observation so that we can resume the fme in the
498 * correct state if eft or fmd crashes and we're restarted.
501 serialize_observation(struct fme
*fp
, const char *cls
, const struct ipath
*ipp
)
504 char tmpbuf
[OBBUFNMSZ
];
508 (void) snprintf(tmpbuf
, OBBUFNMSZ
, "observed%d", fp
->uniqobs
);
509 estr
= ipath2str(cls
, ipp
);
510 fmd_buf_create(fp
->hdl
, fp
->fmcase
, tmpbuf
, strlen(estr
) + 1);
511 fmd_buf_write(fp
->hdl
, fp
->fmcase
, tmpbuf
, (void *)estr
,
515 if (fp
->ecurrent
!= NULL
&& fp
->ecurrent
->nvp
!= NULL
) {
516 (void) snprintf(tmpbuf
,
517 OBBUFNMSZ
, "observed%d.nvp", fp
->uniqobs
);
518 if (nvlist_xpack(fp
->ecurrent
->nvp
,
519 &pkd
, &pkdlen
, NV_ENCODE_XDR
, &Eft_nv_hdl
) != 0)
520 out(O_DIE
|O_SYS
, "pack of observed nvl failed");
521 fmd_buf_create(fp
->hdl
, fp
->fmcase
, tmpbuf
, pkdlen
);
522 fmd_buf_write(fp
->hdl
, fp
->fmcase
, tmpbuf
, (void *)pkd
, pkdlen
);
527 fmd_buf_write(fp
->hdl
, fp
->fmcase
, WOBUF_NOBS
, (void *)&fp
->uniqobs
,
528 sizeof (fp
->uniqobs
));
532 * init_fme_bufs -- We keep several bits of state about an fme for
533 * use if eft or fmd crashes and we're restarted.
536 init_fme_bufs(struct fme
*fp
)
538 fmd_buf_create(fp
->hdl
, fp
->fmcase
, WOBUF_PULL
, sizeof (fp
->pull
));
539 fmd_buf_write(fp
->hdl
, fp
->fmcase
, WOBUF_PULL
, (void *)&fp
->pull
,
542 fmd_buf_create(fp
->hdl
, fp
->fmcase
, WOBUF_ID
, sizeof (fp
->id
));
543 fmd_buf_write(fp
->hdl
, fp
->fmcase
, WOBUF_ID
, (void *)&fp
->id
,
546 fmd_buf_create(fp
->hdl
, fp
->fmcase
, WOBUF_NOBS
, sizeof (fp
->uniqobs
));
547 fmd_buf_write(fp
->hdl
, fp
->fmcase
, WOBUF_NOBS
, (void *)&fp
->uniqobs
,
548 sizeof (fp
->uniqobs
));
550 fmd_buf_create(fp
->hdl
, fp
->fmcase
, WOBUF_POSTD
,
551 sizeof (fp
->posted_suspects
));
552 fmd_buf_write(fp
->hdl
, fp
->fmcase
, WOBUF_POSTD
,
553 (void *)&fp
->posted_suspects
, sizeof (fp
->posted_suspects
));
557 destroy_fme_bufs(struct fme
*fp
)
559 char tmpbuf
[OBBUFNMSZ
];
562 platform_restore_config(fp
->hdl
, fp
->fmcase
);
563 fmd_buf_destroy(fp
->hdl
, fp
->fmcase
, WOBUF_CFGLEN
);
564 fmd_buf_destroy(fp
->hdl
, fp
->fmcase
, WOBUF_CFG
);
565 fmd_buf_destroy(fp
->hdl
, fp
->fmcase
, WOBUF_PULL
);
566 fmd_buf_destroy(fp
->hdl
, fp
->fmcase
, WOBUF_ID
);
567 fmd_buf_destroy(fp
->hdl
, fp
->fmcase
, WOBUF_POSTD
);
568 fmd_buf_destroy(fp
->hdl
, fp
->fmcase
, WOBUF_NOBS
);
570 for (o
= 0; o
< fp
->uniqobs
; o
++) {
571 (void) snprintf(tmpbuf
, OBBUFNMSZ
, "observed%d", o
);
572 fmd_buf_destroy(fp
->hdl
, fp
->fmcase
, tmpbuf
);
573 (void) snprintf(tmpbuf
, OBBUFNMSZ
, "observed%d.nvp", o
);
574 fmd_buf_destroy(fp
->hdl
, fp
->fmcase
, tmpbuf
);
579 * reconstitute_observations -- convert a case's serialized observations
580 * back into struct events. Returns zero if all observations are
581 * successfully reconstituted.
584 reconstitute_observations(struct fme
*fmep
)
587 struct node
*epnamenp
= NULL
;
590 char *tmpbuf
= alloca(OBBUFNMSZ
);
596 for (ocnt
= 0; ocnt
< fmep
->uniqobs
; ocnt
++) {
597 (void) snprintf(tmpbuf
, OBBUFNMSZ
, "observed%d", ocnt
);
598 elen
= fmd_buf_size(fmep
->hdl
, fmep
->fmcase
, tmpbuf
);
601 "reconstitute_observation: no %s buffer found.",
603 Undiag_reason
= UD_VAL_MISSINGOBS
;
608 fmd_buf_read(fmep
->hdl
, fmep
->fmcase
, tmpbuf
, estr
, elen
);
609 sepptr
= strchr(estr
, '@');
610 if (sepptr
== NULL
) {
612 "reconstitute_observation: %s: "
613 "missing @ separator in %s.",
615 Undiag_reason
= UD_VAL_MISSINGPATH
;
621 if ((epnamenp
= pathstring2epnamenp(sepptr
+ 1)) == NULL
) {
623 "reconstitute_observation: %s: "
624 "trouble converting path string \"%s\" "
625 "to internal representation.",
627 Undiag_reason
= UD_VAL_MISSINGPATH
;
632 /* construct the event */
633 ep
= itree_lookup(fmep
->eventtree
,
634 stable(estr
), ipath(epnamenp
));
637 "reconstitute_observation: %s: "
638 "lookup of \"%s\" in itree failed.",
639 tmpbuf
, ipath2str(estr
, ipath(epnamenp
)));
640 Undiag_reason
= UD_VAL_BADOBS
;
648 * We may or may not have a saved nvlist for the observation
650 (void) snprintf(tmpbuf
, OBBUFNMSZ
, "observed%d.nvp", ocnt
);
651 pkdlen
= fmd_buf_size(fmep
->hdl
, fmep
->fmcase
, tmpbuf
);
653 pkd
= MALLOC(pkdlen
);
654 fmd_buf_read(fmep
->hdl
,
655 fmep
->fmcase
, tmpbuf
, pkd
, pkdlen
);
656 ASSERT(ep
->nvp
== NULL
);
657 if (nvlist_xunpack(pkd
,
658 pkdlen
, &ep
->nvp
, &Eft_nv_hdl
) != 0)
659 out(O_DIE
|O_SYS
, "pack of observed nvl failed");
670 /* link it into list of observations seen */
671 ep
->observations
= fmep
->observations
;
672 fmep
->observations
= ep
;
675 if (ocnt
== fmep
->uniqobs
) {
676 (void) fme_ready(fmep
);
684 * fme_restart -- called during eft initialization. Reconstitutes
685 * an in-progress fme.
688 fme_restart(fmd_hdl_t
*hdl
, fmd_case_t
*inprogress
)
691 struct case_list
*bad
;
693 struct cfgdata
*cfgdata
;
696 char *tmpbuf
= alloca(OBBUFNMSZ
);
700 struct node
*epnamenp
= NULL
;
702 extern int alloc_total();
706 * ignore solved or closed cases
708 if (fmd_case_solved(hdl
, inprogress
) ||
709 fmd_case_closed(hdl
, inprogress
))
713 fmep
->fmcase
= inprogress
;
716 if (fmd_buf_size(hdl
, inprogress
, WOBUF_POSTD
) == 0) {
717 out(O_ALTFP
, "restart_fme: no saved posted status");
718 Undiag_reason
= UD_VAL_MISSINGINFO
;
721 fmd_buf_read(hdl
, inprogress
, WOBUF_POSTD
,
722 (void *)&fmep
->posted_suspects
,
723 sizeof (fmep
->posted_suspects
));
726 if (fmd_buf_size(hdl
, inprogress
, WOBUF_ID
) == 0) {
727 out(O_ALTFP
, "restart_fme: no saved id");
728 Undiag_reason
= UD_VAL_MISSINGINFO
;
731 fmd_buf_read(hdl
, inprogress
, WOBUF_ID
, (void *)&fmep
->id
,
734 if (Nextid
<= fmep
->id
)
735 Nextid
= fmep
->id
+ 1;
737 out(O_ALTFP
, "Replay FME %d", fmep
->id
);
739 if (fmd_buf_size(hdl
, inprogress
, WOBUF_CFGLEN
) != sizeof (size_t)) {
740 out(O_ALTFP
, "restart_fme: No config data");
741 Undiag_reason
= UD_VAL_MISSINGINFO
;
744 fmd_buf_read(hdl
, inprogress
, WOBUF_CFGLEN
, (void *)&rawsz
,
747 if ((fmep
->e0r
= fmd_case_getprincipal(hdl
, inprogress
)) == NULL
) {
748 out(O_ALTFP
, "restart_fme: No event zero");
749 Undiag_reason
= UD_VAL_MISSINGZERO
;
753 if (fmd_buf_size(hdl
, inprogress
, WOBUF_PULL
) == 0) {
754 out(O_ALTFP
, "restart_fme: no saved wait time");
755 Undiag_reason
= UD_VAL_MISSINGINFO
;
758 fmd_buf_read(hdl
, inprogress
, WOBUF_PULL
, (void *)&fmep
->pull
,
759 sizeof (fmep
->pull
));
762 if (fmd_buf_size(hdl
, inprogress
, WOBUF_NOBS
) == 0) {
763 out(O_ALTFP
, "restart_fme: no count of observations");
764 Undiag_reason
= UD_VAL_MISSINGINFO
;
767 fmd_buf_read(hdl
, inprogress
, WOBUF_NOBS
,
768 (void *)&fmep
->uniqobs
, sizeof (fmep
->uniqobs
));
771 (void) snprintf(tmpbuf
, OBBUFNMSZ
, "observed0");
772 elen
= fmd_buf_size(fmep
->hdl
, fmep
->fmcase
, tmpbuf
);
774 out(O_ALTFP
, "reconstitute_observation: no %s buffer found.",
776 Undiag_reason
= UD_VAL_MISSINGOBS
;
780 fmd_buf_read(fmep
->hdl
, fmep
->fmcase
, tmpbuf
, estr
, elen
);
781 sepptr
= strchr(estr
, '@');
782 if (sepptr
== NULL
) {
783 out(O_ALTFP
, "reconstitute_observation: %s: "
784 "missing @ separator in %s.",
786 Undiag_reason
= UD_VAL_MISSINGPATH
;
791 if ((epnamenp
= pathstring2epnamenp(sepptr
+ 1)) == NULL
) {
792 out(O_ALTFP
, "reconstitute_observation: %s: "
793 "trouble converting path string \"%s\" "
794 "to internal representation.", tmpbuf
, sepptr
+ 1);
795 Undiag_reason
= UD_VAL_MISSINGPATH
;
799 (void) prune_propagations(stable(estr
), ipath(epnamenp
));
803 init_size
= alloc_total();
804 out(O_ALTFP
|O_STAMP
, "start config_restore using %d bytes", init_size
);
805 cfgdata
= MALLOC(sizeof (struct cfgdata
));
806 cfgdata
->cooked
= NULL
;
807 cfgdata
->devcache
= NULL
;
808 cfgdata
->devidcache
= NULL
;
809 cfgdata
->tpcache
= NULL
;
810 cfgdata
->cpucache
= NULL
;
811 cfgdata
->raw_refcnt
= 1;
814 if (fmd_buf_size(hdl
, inprogress
, WOBUF_CFG
) != rawsz
) {
815 out(O_ALTFP
, "restart_fme: Config data size mismatch");
816 Undiag_reason
= UD_VAL_CFGMISMATCH
;
819 cfgdata
->begin
= MALLOC(rawsz
);
820 cfgdata
->end
= cfgdata
->nextfree
= cfgdata
->begin
+ rawsz
;
822 inprogress
, WOBUF_CFG
, cfgdata
->begin
, rawsz
);
824 cfgdata
->begin
= cfgdata
->end
= cfgdata
->nextfree
= NULL
;
827 config_cook(cfgdata
);
828 fmep
->config
= cfgdata
->cooked
;
829 config_free(cfgdata
);
830 out(O_ALTFP
|O_STAMP
, "config_restore added %d bytes",
831 alloc_total() - init_size
);
833 if ((fmep
->eventtree
= itree_create(fmep
->config
)) == NULL
) {
834 /* case not properly saved or irretrievable */
835 out(O_ALTFP
, "restart_fme: NULL instance tree");
836 Undiag_reason
= UD_VAL_INSTFAIL
;
840 itree_ptree(O_ALTFP
|O_VERB2
, fmep
->eventtree
);
842 if (reconstitute_observations(fmep
) != 0)
845 out(O_ALTFP
|O_NONL
, "FME %d replay observations: ", fmep
->id
);
846 for (ep
= fmep
->observations
; ep
; ep
= ep
->observations
) {
847 out(O_ALTFP
|O_NONL
, " ");
848 itree_pevent_brief(O_ALTFP
|O_NONL
, ep
);
854 /* give the diagnosis algorithm a shot at the new FME state */
855 fme_eval(fmep
, fmep
->e0r
);
859 if (fmep
->eventtree
!= NULL
)
860 itree_free(fmep
->eventtree
);
862 structconfig_free(fmep
->config
);
863 destroy_fme_bufs(fmep
);
867 * Since we're unable to restart the case, add it to the undiagable
868 * list and solve and close it as appropriate.
870 bad
= MALLOC(sizeof (struct case_list
));
873 if (Undiagablecaselist
!= NULL
)
874 bad
->next
= Undiagablecaselist
;
875 Undiagablecaselist
= bad
;
876 bad
->fmcase
= inprogress
;
878 out(O_ALTFP
|O_NONL
, "[case %s (unable to restart), ",
879 fmd_case_uuid(hdl
, bad
->fmcase
));
881 if (fmd_case_solved(hdl
, bad
->fmcase
)) {
882 out(O_ALTFP
|O_NONL
, "already solved, ");
884 out(O_ALTFP
|O_NONL
, "solving, ");
885 defect
= fmd_nvl_create_fault(hdl
,
886 undiag_2defect_str(Undiag_reason
), 100, NULL
, NULL
, NULL
);
887 reason
= undiag_2reason_str(Undiag_reason
, NULL
);
888 (void) nvlist_add_string(defect
, UNDIAG_REASON
, reason
);
890 fmd_case_add_suspect(hdl
, bad
->fmcase
, defect
);
891 fmd_case_solve(hdl
, bad
->fmcase
);
892 Undiag_reason
= UD_VAL_UNKNOWN
;
895 if (fmd_case_closed(hdl
, bad
->fmcase
)) {
896 out(O_ALTFP
, "already closed ]");
898 out(O_ALTFP
, "closing ]");
899 fmd_case_close(hdl
, bad
->fmcase
);
905 globals_destructor(void *left
, void *right
, void *arg
)
907 struct evalue
*evp
= (struct evalue
*)right
;
908 if (evp
->t
== NODEPTR
)
909 tree_free((struct node
*)(uintptr_t)evp
->v
);
910 evp
->v
= (uintptr_t)NULL
;
915 destroy_fme(struct fme
*f
)
917 stats_delete(f
->Rcount
);
918 stats_delete(f
->Hcallcount
);
919 stats_delete(f
->Rcallcount
);
920 stats_delete(f
->Ccallcount
);
921 stats_delete(f
->Ecallcount
);
922 stats_delete(f
->Tcallcount
);
923 stats_delete(f
->Marrowcount
);
924 stats_delete(f
->diags
);
926 if (f
->eventtree
!= NULL
)
927 itree_free(f
->eventtree
);
929 structconfig_free(f
->config
);
930 lut_free(f
->globals
, globals_destructor
, NULL
);
/*
 * fme_state2str -- map an fme_state value to a printable name.
 *
 * Each of the five known states yields its name without the FME_
 * prefix ("NOTHING", "WAIT", "CREDIBLE", "DISPROVED", "DEFERRED").
 * Any other value yields "UNKNOWN".  The returned strings are literals,
 * so the caller must not modify or free them.
 */
935 fme_state2str(enum fme_state s
)
938 case FME_NOTHING
: return ("NOTHING");
939 case FME_WAIT
: return ("WAIT");
940 case FME_CREDIBLE
: return ("CREDIBLE");
941 case FME_DISPROVED
: return ("DISPROVED");
942 case FME_DEFERRED
: return ("DEFERRED");
/* catch-all for values outside the enumerators listed above */
943 default: return ("UNKNOWN");
/*
 * is_problem -- true when name type t is one of the three problem
 * kinds: N_FAULT, N_DEFECT or N_UPSET.  False for every other type.
 */
948 is_problem(enum nametype t
)
950 return (t
== N_FAULT
|| t
== N_DEFECT
|| t
== N_UPSET
);
/*
 * is_defect -- true only when name type t is N_DEFECT.
 */
954 is_defect(enum nametype t
)
956 return (t
== N_DEFECT
);
/*
 * is_upset -- true only when name type t is N_UPSET.
 */
960 is_upset(enum nametype t
)
962 return (t
== N_UPSET
);
966 fme_print(int flags
, struct fme
*fmep
)
970 out(flags
, "Fault Management Exercise %d", fmep
->id
);
971 out(flags
, "\t State: %s", fme_state2str(fmep
->state
));
972 out(flags
|O_NONL
, "\t Start time: ");
973 ptree_timeval(flags
|O_NONL
, &fmep
->ull
);
976 out(flags
|O_NONL
, "\t Wait time: ");
977 ptree_timeval(flags
|O_NONL
, &fmep
->wull
);
980 out(flags
|O_NONL
, "\t E0: ");
982 itree_pevent_brief(flags
|O_NONL
, fmep
->e0
);
984 out(flags
|O_NONL
, "NULL");
986 out(flags
|O_NONL
, "\tObservations:");
987 for (ep
= fmep
->observations
; ep
; ep
= ep
->observations
) {
988 out(flags
|O_NONL
, " ");
989 itree_pevent_brief(flags
|O_NONL
, ep
);
992 out(flags
|O_NONL
, "\tSuspect list:");
993 for (ep
= fmep
->suspects
; ep
; ep
= ep
->suspects
) {
994 out(flags
|O_NONL
, " ");
995 itree_pevent_brief(flags
|O_NONL
, ep
);
998 if (fmep
->eventtree
!= NULL
) {
999 out(flags
|O_VERB2
, "\t Tree:");
1000 itree_ptree(flags
|O_VERB2
, fmep
->eventtree
);
1004 static struct node
*
1005 pathstring2epnamenp(char *path
)
1011 if ((ptr
= strtok(path
, sep
)) == NULL
)
1012 out(O_DIE
, "pathstring2epnamenp: invalid empty class");
1014 ret
= tree_iname(stable(ptr
), NULL
, 0);
1016 while ((ptr
= strtok(NULL
, sep
)) != NULL
)
1017 ret
= tree_name_append(ret
,
1018 tree_iname(stable(ptr
), NULL
, 0));
1024 * for a given upset sp, increment the corresponding SERD engine. if the
1025 * SERD engine trips, return the ename and ipp of the resulting ereport.
1026 * returns true if engine tripped and *enamep and *ippp were filled in.
1029 serd_eval(struct fme
*fmep
, fmd_hdl_t
*hdl
, fmd_event_t
*ffep
,
1030 fmd_case_t
*fmcase
, struct event
*sp
, const char **enamep
,
1031 const struct ipath
**ippp
)
1033 struct node
*serdinst
;
1038 struct serd_entry
*newentp
;
1039 int i
, serdn
= -1, serdincrement
= 1, len
= 0;
1040 char *serdsuffix
= NULL
, *serdt
= NULL
;
1043 ASSERT(sp
->t
== N_UPSET
);
1044 ASSERT(ffep
!= NULL
);
1046 if ((ep
= (struct evalue
*)lut_lookup(sp
->serdprops
,
1047 (void *)"n", (lut_cmp
)strcmp
)) != NULL
) {
1048 ASSERT(ep
->t
== UINT64
);
1051 if ((ep
= (struct evalue
*)lut_lookup(sp
->serdprops
,
1052 (void *)"t", (lut_cmp
)strcmp
)) != NULL
) {
1053 ASSERT(ep
->t
== STRING
);
1054 serdt
= (char *)(uintptr_t)ep
->v
;
1056 if ((ep
= (struct evalue
*)lut_lookup(sp
->serdprops
,
1057 (void *)"suffix", (lut_cmp
)strcmp
)) != NULL
) {
1058 ASSERT(ep
->t
== STRING
);
1059 serdsuffix
= (char *)(uintptr_t)ep
->v
;
1061 if ((ep
= (struct evalue
*)lut_lookup(sp
->serdprops
,
1062 (void *)"increment", (lut_cmp
)strcmp
)) != NULL
) {
1063 ASSERT(ep
->t
== UINT64
);
1064 serdincrement
= (int)ep
->v
;
1068 * obtain instanced SERD engine from the upset sp. from this
1069 * derive serdname, the string used to identify the SERD engine.
1071 serdinst
= eventprop_lookup(sp
, L_engine
);
1073 if (serdinst
== NULL
)
1076 len
= strlen(serdinst
->u
.stmt
.np
->u
.event
.ename
->u
.name
.s
) + 1;
1077 if (serdsuffix
!= NULL
)
1078 len
+= strlen(serdsuffix
);
1079 serdclass
= MALLOC(len
);
1080 if (serdsuffix
!= NULL
)
1081 (void) snprintf(serdclass
, len
, "%s%s",
1082 serdinst
->u
.stmt
.np
->u
.event
.ename
->u
.name
.s
, serdsuffix
);
1084 (void) snprintf(serdclass
, len
, "%s",
1085 serdinst
->u
.stmt
.np
->u
.event
.ename
->u
.name
.s
);
1086 serdresource
= ipath2str(NULL
,
1087 ipath(serdinst
->u
.stmt
.np
->u
.event
.epname
));
1088 len
+= strlen(serdresource
) + 1;
1089 serdname
= MALLOC(len
);
1090 (void) snprintf(serdname
, len
, "%s@%s", serdclass
, serdresource
);
1093 /* handle serd engine "id" property, if there is one */
1095 lut_lookup(serdinst
->u
.stmt
.lutp
, (void *)L_id
, NULL
)) != NULL
) {
1096 struct evalue
*gval
;
1097 char suffixbuf
[200];
1102 out(O_ALTFP
|O_NONL
, "serd \"%s\" id: ", serdname
);
1103 ptree_name_iter(O_ALTFP
|O_NONL
, nid
);
1105 ASSERTinfo(nid
->t
== T_GLOBID
, ptree_nodetype2str(nid
->t
));
1107 if ((gval
= lut_lookup(fmep
->globals
,
1108 (void *)nid
->u
.globid
.s
, NULL
)) == NULL
) {
1109 out(O_ALTFP
, " undefined");
1110 } else if (gval
->t
== UINT64
) {
1111 out(O_ALTFP
, " %llu", gval
->v
);
1112 (void) sprintf(suffixbuf
, "%llu", gval
->v
);
1115 out(O_ALTFP
, " \"%s\"", (char *)(uintptr_t)gval
->v
);
1116 suffix
= (char *)(uintptr_t)gval
->v
;
1119 nname
= strlen(serdname
) + strlen(suffix
) + 2;
1120 nserdname
= MALLOC(nname
);
1121 (void) snprintf(nserdname
, nname
, "%s:%s", serdname
, suffix
);
1123 serdname
= nserdname
;
1127 * if the engine is empty, and we have an override for n/t then
1128 * destroy and recreate it.
1130 if ((serdn
!= -1 || serdt
!= NULL
) && fmd_serd_exists(hdl
, serdname
) &&
1131 fmd_serd_empty(hdl
, serdname
))
1132 fmd_serd_destroy(hdl
, serdname
);
1134 if (!fmd_serd_exists(hdl
, serdname
)) {
1135 struct node
*nN
, *nT
;
1144 int got_n_override
= 0, got_t_override
= 0;
1146 /* no SERD engine yet, so create it */
1147 nodep
= serdinst
->u
.stmt
.np
->u
.event
.epname
;
1148 path
= ipath2str(NULL
, ipath(nodep
));
1149 cp
= config_lookup(fmep
->config
, path
, 0);
1153 * We allow serd parameters to be overridden, either from
1154 * eft.conf file values (if Serd_Override is set) or from
1155 * driver properties (for "serd.io.device" engines).
1157 if (Serd_Override
!= NULL
) {
1158 char *save_ptr
, *ptr1
, *ptr2
, *ptr3
;
1159 ptr3
= save_ptr
= STRDUP(Serd_Override
);
1160 while (*ptr3
!= '\0') {
1161 ptr1
= strchr(ptr3
, ',');
1163 if (strcmp(ptr3
, serdclass
) == 0) {
1164 ptr2
= strchr(ptr1
+ 1, ',');
1166 nval
= atoi(ptr1
+ 1);
1167 out(O_ALTFP
, "serd override %s_n %d",
1169 ptr3
= strchr(ptr2
+ 1, ' ');
1172 ptr
= STRDUP(ptr2
+ 1);
1173 out(O_ALTFP
, "serd override %s_t %s",
1179 ptr2
= strchr(ptr1
+ 1, ',');
1180 ptr3
= strchr(ptr2
+ 1, ' ');
1189 if (cp
&& got_n_override
== 0) {
1191 * convert serd engine class into property name
1193 char *prop_name
= MALLOC(strlen(serdclass
) + 3);
1194 for (i
= 0; i
< strlen(serdclass
); i
++) {
1195 if (serdclass
[i
] == '.')
1198 prop_name
[i
] = serdclass
[i
];
1200 prop_name
[i
++] = '_';
1201 prop_name
[i
++] = 'n';
1202 prop_name
[i
] = '\0';
1203 if (s
= config_getprop(cp
, prop_name
)) {
1205 out(O_ALTFP
, "serd override %s_n %s",
1209 prop_name
[i
- 1] = 't';
1210 if (s
= config_getprop(cp
, prop_name
)) {
1212 out(O_ALTFP
, "serd override %s_t %s",
1219 if (serdn
!= -1 && got_n_override
== 0) {
1221 out(O_ALTFP
, "serd override %s_n %d", serdclass
, serdn
);
1224 if (serdt
!= NULL
&& got_t_override
== 0) {
1225 ptr
= STRDUP(serdt
);
1226 out(O_ALTFP
, "serd override %s_t %s", serdclass
, serdt
);
1230 if (!got_n_override
) {
1231 nN
= lut_lookup(serdinst
->u
.stmt
.lutp
, (void *)L_N
,
1233 ASSERT(nN
->t
== T_NUM
);
1234 nval
= (uint_t
)nN
->u
.ull
;
1236 if (!got_t_override
) {
1237 nT
= lut_lookup(serdinst
->u
.stmt
.lutp
, (void *)L_T
,
1239 ASSERT(nT
->t
== T_TIMEVAL
);
1240 tval
= (hrtime_t
)nT
->u
.ull
;
1242 const unsigned long long *ullp
;
1246 len
= strspn(ptr
, "0123456789");
1247 suffix
= stable(&ptr
[len
]);
1248 ullp
= (unsigned long long *)lut_lookup(Timesuffixlut
,
1249 (void *)suffix
, NULL
);
1251 tval
= strtoull(ptr
, NULL
, 0) * (ullp
? *ullp
: 1ll);
1254 fmd_serd_create(hdl
, serdname
, nval
, tval
);
1257 newentp
= MALLOC(sizeof (*newentp
));
1258 newentp
->ename
= stable(serdclass
);
1260 newentp
->ipath
= ipath(serdinst
->u
.stmt
.np
->u
.event
.epname
);
1262 if (lut_lookup(SerdEngines
, newentp
, (lut_cmp
)serd_cmp
) == NULL
) {
1263 SerdEngines
= lut_add(SerdEngines
, (void *)newentp
,
1264 (void *)newentp
, (lut_cmp
)serd_cmp
);
1273 * increment SERD engine. if engine fires, reset serd
1274 * engine and return trip_strcode if required.
1276 for (i
= 0; i
< serdincrement
; i
++) {
1277 if (fmd_serd_record(hdl
, serdname
, ffep
)) {
1278 fmd_case_add_serd(hdl
, fmcase
, serdname
);
1279 fmd_serd_reset(hdl
, serdname
);
1282 struct node
*tripinst
=
1283 lut_lookup(serdinst
->u
.stmt
.lutp
,
1284 (void *)L_trip
, NULL
);
1285 ASSERT(tripinst
!= NULL
);
1286 *enamep
= tripinst
->u
.event
.ename
->u
.name
.s
;
1287 *ippp
= ipath(tripinst
->u
.event
.epname
);
1289 "[engine fired: %s, sending: ", serdname
);
1290 ipath_print(O_ALTFP
|O_NONL
, *enamep
, *ippp
);
1293 out(O_ALTFP
, "[engine fired: %s, no trip]",
1306 * search a suspect list for upsets. feed each upset to serd_eval() and
1307 * build up tripped[], an array of ereports produced by the firing of
1308 * any SERD engines. then feed each ereport back into
1309 * fme_receive_report().
1311 * returns ntrip, the number of these ereports produced.
1314 upsets_eval(struct fme
*fmep
, fmd_event_t
*ffep
)
1316 /* we build an array of tripped ereports that we send ourselves */
1319 const struct ipath
*ipp
;
1322 int ntrip
, nupset
, i
;
1325 * count the number of upsets to determine the upper limit on
1326 * expected trip ereport strings. remember that one upset can
1327 * lead to at most one ereport.
1330 for (sp
= fmep
->suspects
; sp
; sp
= sp
->suspects
) {
1331 if (sp
->t
== N_UPSET
)
1339 * get to this point if we have upsets and expect some trip
1342 tripped
= alloca(sizeof (*tripped
) * nupset
);
1343 bzero((void *)tripped
, sizeof (*tripped
) * nupset
);
1346 for (sp
= fmep
->suspects
; sp
; sp
= sp
->suspects
)
1347 if (sp
->t
== N_UPSET
&&
1348 serd_eval(fmep
, fmep
->hdl
, ffep
, fmep
->fmcase
, sp
,
1349 &tripped
[ntrip
].ename
, &tripped
[ntrip
].ipp
) == 1)
1352 for (i
= 0; i
< ntrip
; i
++) {
1353 struct event
*ep
, *nep
;
1356 const struct ipath
*ipp
;
1357 const char *eventstring
;
1359 unsigned long long my_delay
= TIMEVAL_EVENTUALLY
;
1360 enum fme_state state
;
1363 * First try and evaluate a case with the trip ereport plus
1364 * all the other ereports that cause the trip. If that fails
1365 * to evaluate then try again with just this ereport on its own.
1367 out(O_ALTFP
|O_NONL
, "fme_receive_report_serd: ");
1368 ipath_print(O_ALTFP
|O_NONL
, tripped
[i
].ename
, tripped
[i
].ipp
);
1369 out(O_ALTFP
|O_STAMP
, NULL
);
1371 eventstring
= ep
->enode
->u
.event
.ename
->u
.name
.s
;
1375 * create a duplicate fme and case
1377 fmcase
= fmd_case_open(fmep
->hdl
, NULL
);
1378 out(O_ALTFP
|O_NONL
, "duplicate fme for event [");
1379 ipath_print(O_ALTFP
|O_NONL
, eventstring
, ipp
);
1382 if ((nfmep
= newfme(eventstring
, ipp
, fmep
->hdl
,
1383 fmcase
, ffep
, ep
->nvp
)) == NULL
) {
1384 out(O_ALTFP
|O_NONL
, "[");
1385 ipath_print(O_ALTFP
|O_NONL
, eventstring
, ipp
);
1386 out(O_ALTFP
, " CANNOT DIAGNOSE]");
1391 nfmep
->pull
= fmep
->pull
;
1392 init_fme_bufs(nfmep
);
1393 out(O_ALTFP
|O_NONL
, "[");
1394 ipath_print(O_ALTFP
|O_NONL
, eventstring
, ipp
);
1395 out(O_ALTFP
, " created FME%d, case %s]", nfmep
->id
,
1396 fmd_case_uuid(nfmep
->hdl
, nfmep
->fmcase
));
1398 fmd_case_setprincipal(nfmep
->hdl
, nfmep
->fmcase
, ffep
);
1399 fmd_case_add_ereport(nfmep
->hdl
, nfmep
->fmcase
, ffep
);
1404 * add the original ereports
1406 for (ep
= fmep
->observations
; ep
; ep
= ep
->observations
) {
1407 eventstring
= ep
->enode
->u
.event
.ename
->u
.name
.s
;
1409 out(O_ALTFP
|O_NONL
, "adding event [");
1410 ipath_print(O_ALTFP
|O_NONL
, eventstring
, ipp
);
1412 nep
= itree_lookup(nfmep
->eventtree
, eventstring
, ipp
);
1413 if (nep
->count
++ == 0) {
1414 nep
->observations
= nfmep
->observations
;
1415 nfmep
->observations
= nep
;
1416 serialize_observation(nfmep
, eventstring
, ipp
);
1417 nep
->nvp
= evnv_dupnvl(ep
->nvp
);
1419 if (ep
->ffep
&& ep
->ffep
!= ffep
)
1420 fmd_case_add_ereport(nfmep
->hdl
, nfmep
->fmcase
,
1422 stats_counter_bump(nfmep
->Rcount
);
1426 * add the serd trigger ereport
1428 if ((ep
= itree_lookup(nfmep
->eventtree
, tripped
[i
].ename
,
1429 tripped
[i
].ipp
)) == NULL
) {
1431 * The trigger ereport is not in the instance tree. It
1432 * was presumably removed by prune_propagations() as
1433 * this combination of events is not present in the
1436 out(O_ALTFP
, "upsets_eval: e0 not in instance tree");
1437 Undiag_reason
= UD_VAL_BADEVENTI
;
1438 goto retry_lone_ereport
;
1440 out(O_ALTFP
|O_NONL
, "adding event [");
1441 ipath_print(O_ALTFP
|O_NONL
, tripped
[i
].ename
, tripped
[i
].ipp
);
1443 nfmep
->ecurrent
= ep
;
1446 ep
->observations
= nfmep
->observations
;
1447 nfmep
->observations
= ep
;
1453 prev_verbose
= Verbose
;
1456 lut_walk(nfmep
->eventtree
, (lut_cb
)clear_arrows
, (void *)nfmep
);
1457 state
= hypothesise(nfmep
, nfmep
->e0
, nfmep
->ull
, &my_delay
);
1459 Verbose
= prev_verbose
;
1460 if (state
== FME_DISPROVED
) {
1461 out(O_ALTFP
, "upsets_eval: hypothesis disproved");
1462 Undiag_reason
= UD_VAL_UNSOLVD
;
1465 * However the trigger ereport on its own might be
1466 * diagnosable, so check for that. Undo the new fme
1467 * and case we just created and call fme_receive_report.
1469 out(O_ALTFP
|O_NONL
, "[");
1470 ipath_print(O_ALTFP
|O_NONL
, tripped
[i
].ename
,
1472 out(O_ALTFP
, " retrying with just trigger ereport]");
1473 itree_free(nfmep
->eventtree
);
1474 nfmep
->eventtree
= NULL
;
1475 structconfig_free(nfmep
->config
);
1476 nfmep
->config
= NULL
;
1477 destroy_fme_bufs(nfmep
);
1478 fmd_case_close(nfmep
->hdl
, nfmep
->fmcase
);
1479 fme_receive_report(fmep
->hdl
, ffep
,
1480 tripped
[i
].ename
, tripped
[i
].ipp
, NULL
);
1487 serialize_observation(nfmep
, tripped
[i
].ename
, tripped
[i
].ipp
);
1488 fme_eval(nfmep
, ffep
);
1495 * fme_receive_external_report -- call when an external ereport comes in
1497 * this routine just converts the relevant information from the ereport
1498 * into a format used internally and passes it on to fme_receive_report().
1501 fme_receive_external_report(fmd_hdl_t
*hdl
, fmd_event_t
*ffep
, nvlist_t
*nvl
,
1504 struct node
*epnamenp
;
1506 const struct ipath
*ipp
;
1507 nvlist_t
*detector
= NULL
;
1509 class = stable(class);
1511 /* Get the component path from the ereport */
1512 epnamenp
= platform_getpath(nvl
);
1514 /* See if we ended up without a path. */
1515 if (epnamenp
== NULL
) {
1516 /* See if class permits silent discard on unknown component. */
1517 if (lut_lookup(Ereportenames_discard
, (void *)class, NULL
)) {
1518 out(O_ALTFP
|O_VERB2
, "Unable to map \"%s\" ereport "
1519 "to component path, but silent discard allowed.",
1523 * XFILE: Failure to find a component is bad unless
1524 * 'discard_if_config_unknown=1' was specified in the
1525 * ereport definition. Indicate undiagnosable.
1527 Undiag_reason
= UD_VAL_NOPATH
;
1528 fmcase
= fmd_case_open(hdl
, NULL
);
1531 * We don't have a component path here (which means that
1532 * the detector was not in hc-scheme and couldn't be
1533 * converted to hc-scheme). Report the raw detector as
1534 * the suspect resource if there is one.
1536 (void) nvlist_lookup_nvlist(nvl
, FM_EREPORT_DETECTOR
,
1538 publish_undiagnosable(hdl
, ffep
, fmcase
, detector
,
1544 ipp
= ipath(epnamenp
);
1545 tree_free(epnamenp
);
1546 fme_receive_report(hdl
, ffep
, class, ipp
, nvl
);
1551 fme_receive_repair_list(fmd_hdl_t
*hdl
, fmd_event_t
*ffep
, nvlist_t
*nvl
,
1552 const char *eventstring
)
1557 const struct ipath
*ipp
;
1559 if (nvlist_lookup_string(nvl
, FM_SUSPECT_UUID
, &uuid
) != 0 ||
1560 nvlist_lookup_nvlist_array(nvl
, FM_SUSPECT_FAULT_LIST
,
1562 out(O_ALTFP
, "No uuid or fault list for list.repaired event");
1566 out(O_ALTFP
, "Processing list.repaired from case %s", uuid
);
1568 while (nvc
-- != 0) {
1570 * Reset any istat or serd engine associated with this path.
1574 if ((ipp
= platform_fault2ipath(*nva
++)) == NULL
)
1577 path
= ipath2str(NULL
, ipp
);
1578 out(O_ALTFP
, "fme_receive_repair_list: resetting state for %s",
1582 lut_walk(Istats
, (lut_cb
)istat_counter_reset_cb
, (void *)ipp
);
1585 lut_walk(SerdEngines
, (lut_cb
)serd_reset_cb
, (void *)ipp
);
1592 fme_receive_topology_change(void)
1594 lut_walk(Istats
, (lut_cb
)istat_counter_topo_chg_cb
, NULL
);
1597 lut_walk(SerdEngines
, (lut_cb
)serd_topo_chg_cb
, NULL
);
1601 static int mark_arrows(struct fme
*fmep
, struct event
*ep
, int mark
,
1602 unsigned long long at_latest_by
, unsigned long long *pdelay
, int keep
);
1606 clear_arrows(struct event
*ep
, struct event
*ep2
, struct fme
*fmep
)
1609 struct arrowlist
*ap
;
1611 ep
->cached_state
= 0;
1612 ep
->keep_in_tree
= 0;
1613 for (bp
= itree_next_bubble(ep
, NULL
); bp
;
1614 bp
= itree_next_bubble(ep
, bp
)) {
1615 if (bp
->t
!= B_FROM
)
1618 for (ap
= itree_next_arrow(bp
, NULL
); ap
;
1619 ap
= itree_next_arrow(bp
, ap
))
1620 ap
->arrowp
->mark
= 0;
1625 fme_receive_report(fmd_hdl_t
*hdl
, fmd_event_t
*ffep
,
1626 const char *eventstring
, const struct ipath
*ipp
, nvlist_t
*nvl
)
1629 struct fme
*fmep
= NULL
;
1630 struct fme
*ofmep
= NULL
;
1631 struct fme
*cfmep
, *svfmep
;
1637 out(O_ALTFP
|O_NONL
, "fme_receive_report: ");
1638 ipath_print(O_ALTFP
|O_NONL
, eventstring
, ipp
);
1639 out(O_ALTFP
|O_STAMP
, NULL
);
1641 /* decide which FME it goes to */
1642 for (fmep
= FMElist
; fmep
; fmep
= fmep
->next
) {
1644 unsigned long long my_delay
= TIMEVAL_EVENTUALLY
;
1645 enum fme_state state
;
1646 nvlist_t
*pre_peek_nvp
= NULL
;
1648 if (fmep
->overflow
) {
1649 if (!(fmd_case_closed(fmep
->hdl
, fmep
->fmcase
)))
1656 * ignore solved or closed cases
1658 if (fmep
->posted_suspects
||
1659 fmd_case_solved(fmep
->hdl
, fmep
->fmcase
) ||
1660 fmd_case_closed(fmep
->hdl
, fmep
->fmcase
))
1663 /* look up event in event tree for this FME */
1664 if ((ep
= itree_lookup(fmep
->eventtree
,
1665 eventstring
, ipp
)) == NULL
)
1668 /* note observation */
1669 fmep
->ecurrent
= ep
;
1670 if (ep
->count
++ == 0) {
1671 /* link it into list of observations seen */
1672 ep
->observations
= fmep
->observations
;
1673 fmep
->observations
= ep
;
1674 ep
->nvp
= evnv_dupnvl(nvl
);
1676 /* use new payload values for peek */
1677 pre_peek_nvp
= ep
->nvp
;
1678 ep
->nvp
= evnv_dupnvl(nvl
);
1681 /* tell hypothesise() not to mess with suspect list */
1684 /* don't want this to be verbose (unless Debug is set) */
1685 prev_verbose
= Verbose
;
1689 lut_walk(fmep
->eventtree
, (lut_cb
)clear_arrows
, (void *)fmep
);
1690 state
= hypothesise(fmep
, fmep
->e0
, fmep
->ull
, &my_delay
);
1694 /* put verbose flag back */
1695 Verbose
= prev_verbose
;
1697 if (state
!= FME_DISPROVED
) {
1698 /* found an FME that explains the ereport */
1700 out(O_ALTFP
|O_NONL
, "[");
1701 ipath_print(O_ALTFP
|O_NONL
, eventstring
, ipp
);
1702 out(O_ALTFP
, " explained by FME%d]", fmep
->id
);
1704 nvlist_free(pre_peek_nvp
);
1707 serialize_observation(fmep
, eventstring
, ipp
);
1710 fmd_case_add_ereport(hdl
, fmep
->fmcase
, ffep
);
1714 stats_counter_bump(fmep
->Rcount
);
1717 fme_eval(fmep
, ffep
);
1720 /* not a match, undo noting of observation */
1721 fmep
->ecurrent
= NULL
;
1722 if (--ep
->count
== 0) {
1723 /* unlink it from observations */
1724 fmep
->observations
= ep
->observations
;
1725 ep
->observations
= NULL
;
1726 nvlist_free(ep
->nvp
);
1729 nvlist_free(ep
->nvp
);
1730 ep
->nvp
= pre_peek_nvp
;
1736 return; /* explained by at least one existing FME */
1738 /* clean up closed fmes */
1740 while (cfmep
!= NULL
) {
1741 svfmep
= cfmep
->next
;
1748 out(O_ALTFP
|O_NONL
, "[");
1749 ipath_print(O_ALTFP
|O_NONL
, eventstring
, ipp
);
1750 out(O_ALTFP
, " ADDING TO OVERFLOW FME]");
1752 fmd_case_add_ereport(hdl
, ofmep
->fmcase
, ffep
);
1756 } else if (Max_fme
&& (Open_fme_count
>= Max_fme
)) {
1757 out(O_ALTFP
|O_NONL
, "[");
1758 ipath_print(O_ALTFP
|O_NONL
, eventstring
, ipp
);
1759 out(O_ALTFP
, " MAX OPEN FME REACHED]");
1761 fmcase
= fmd_case_open(hdl
, NULL
);
1763 /* Create overflow fme */
1764 if ((fmep
= newfme(eventstring
, ipp
, hdl
, fmcase
, ffep
,
1766 out(O_ALTFP
|O_NONL
, "[");
1767 ipath_print(O_ALTFP
|O_NONL
, eventstring
, ipp
);
1768 out(O_ALTFP
, " CANNOT OPEN OVERFLOW FME]");
1774 init_fme_bufs(fmep
);
1775 fmep
->overflow
= B_TRUE
;
1778 fmd_case_add_ereport(hdl
, fmep
->fmcase
, ffep
);
1780 Undiag_reason
= UD_VAL_MAXFME
;
1781 defect
= fmd_nvl_create_fault(hdl
,
1782 undiag_2defect_str(Undiag_reason
), 100, NULL
, NULL
, NULL
);
1783 reason
= undiag_2reason_str(Undiag_reason
, NULL
);
1784 (void) nvlist_add_string(defect
, UNDIAG_REASON
, reason
);
1786 fmd_case_add_suspect(hdl
, fmep
->fmcase
, defect
);
1787 fmd_case_solve(hdl
, fmep
->fmcase
);
1788 Undiag_reason
= UD_VAL_UNKNOWN
;
1793 fmcase
= fmd_case_open(hdl
, NULL
);
1795 /* start a new FME */
1796 if ((fmep
= newfme(eventstring
, ipp
, hdl
, fmcase
, ffep
, nvl
)) == NULL
) {
1797 out(O_ALTFP
|O_NONL
, "[");
1798 ipath_print(O_ALTFP
|O_NONL
, eventstring
, ipp
);
1799 out(O_ALTFP
, " CANNOT DIAGNOSE]");
1805 init_fme_bufs(fmep
);
1807 out(O_ALTFP
|O_NONL
, "[");
1808 ipath_print(O_ALTFP
|O_NONL
, eventstring
, ipp
);
1809 out(O_ALTFP
, " created FME%d, case %s]", fmep
->id
,
1810 fmd_case_uuid(hdl
, fmep
->fmcase
));
1815 /* note observation */
1816 fmep
->ecurrent
= ep
;
1817 if (ep
->count
++ == 0) {
1818 /* link it into list of observations seen */
1819 ep
->observations
= fmep
->observations
;
1820 fmep
->observations
= ep
;
1821 ep
->nvp
= evnv_dupnvl(nvl
);
1822 serialize_observation(fmep
, eventstring
, ipp
);
1824 /* new payload overrides any previous */
1825 nvlist_free(ep
->nvp
);
1826 ep
->nvp
= evnv_dupnvl(nvl
);
1829 stats_counter_bump(fmep
->Rcount
);
1832 fmd_case_add_ereport(hdl
, fmep
->fmcase
, ffep
);
1833 fmd_case_setprincipal(hdl
, fmep
->fmcase
, ffep
);
1838 /* give the diagnosis algorithm a shot at the new FME state */
1839 fme_eval(fmep
, ffep
);
1843 fme_status(int flags
)
1847 if (FMElist
== NULL
) {
1848 out(flags
, "No fault management exercises underway.");
1852 for (fmep
= FMElist
; fmep
; fmep
= fmep
->next
)
1853 fme_print(flags
, fmep
);
1857 * "indent" routines used mostly for nicely formatted debug output, but also
1858 * for sanity checking for infinite recursion bugs.
1861 #define MAX_INDENT 1024
1862 static const char *indent_s
[MAX_INDENT
];
1863 static int current_indent
;
1866 indent_push(const char *s
)
1868 if (current_indent
< MAX_INDENT
)
1869 indent_s
[current_indent
++] = s
;
1871 out(O_DIE
, "unexpected recursion depth (%d)", current_indent
);
1875 indent_set(const char *s
)
1884 if (current_indent
> 0)
1887 out(O_DIE
, "recursion underflow");
1896 for (i
= 0; i
< current_indent
; i
++)
1897 out(O_ALTFP
|O_VERB
|O_NONL
, indent_s
[i
]);
1903 #define SLDISPROVED 4
1906 print_suspects(int circumstance
, struct fme
*fmep
)
1910 out(O_ALTFP
|O_NONL
, "[");
1911 if (circumstance
== SLCHANGED
) {
1912 out(O_ALTFP
|O_NONL
, "FME%d diagnosis changed. state: %s, "
1913 "suspect list:", fmep
->id
, fme_state2str(fmep
->state
));
1914 } else if (circumstance
== SLWAIT
) {
1915 out(O_ALTFP
|O_NONL
, "FME%d set wait timer %ld ", fmep
->id
,
1917 ptree_timeval(O_ALTFP
|O_NONL
, &fmep
->wull
);
1918 } else if (circumstance
== SLDISPROVED
) {
1919 out(O_ALTFP
|O_NONL
, "FME%d DIAGNOSIS UNKNOWN", fmep
->id
);
1921 out(O_ALTFP
|O_NONL
, "FME%d DIAGNOSIS PRODUCED:", fmep
->id
);
1924 if (circumstance
== SLWAIT
|| circumstance
== SLDISPROVED
) {
1929 for (ep
= fmep
->suspects
; ep
; ep
= ep
->suspects
) {
1930 out(O_ALTFP
|O_NONL
, " ");
1931 itree_pevent_brief(O_ALTFP
|O_NONL
, ep
);
1936 static struct node
*
1937 eventprop_lookup(struct event
*ep
, const char *propname
)
1939 return (lut_lookup(ep
->props
, (void *)propname
, NULL
));
1942 #define MAXDIGITIDX 23
1943 static char numbuf
[MAXDIGITIDX
+ 1];
1946 node2uint(struct node
*n
, uint_t
*valp
)
1948 struct evalue value
;
1949 struct lut
*globals
= NULL
;
1955 * check value.v since we are being asked to convert an unsigned
1956 * long long int to an unsigned int
1958 if (! eval_expr(n
, NULL
, NULL
, &globals
, NULL
, NULL
, 0, &value
) ||
1959 value
.t
!= UINT64
|| value
.v
> (1ULL << 32))
1962 *valp
= (uint_t
)value
.v
;
1968 node2fmri(struct node
*n
)
1970 nvlist_t
**pa
, *f
, *p
;
1973 char *numstr
, *nullbyte
;
1977 /* XXX do we need to be able to handle a non-T_NAME node? */
1978 if (n
== NULL
|| n
->t
!= T_NAME
)
1981 for (nc
= n
; nc
!= NULL
; nc
= nc
->u
.name
.next
) {
1982 if (nc
->u
.name
.child
== NULL
|| nc
->u
.name
.child
->t
!= T_NUM
)
1988 /* We bailed early, something went wrong */
1992 if ((err
= nvlist_xalloc(&f
, NV_UNIQUE_NAME
, &Eft_nv_hdl
)) != 0)
1993 out(O_DIE
|O_SYS
, "alloc of fmri nvl failed");
1994 pa
= alloca(depth
* sizeof (nvlist_t
*));
1995 for (i
= 0; i
< depth
; i
++)
1998 err
= nvlist_add_string(f
, FM_FMRI_SCHEME
, FM_FMRI_SCHEME_HC
);
1999 err
|= nvlist_add_uint8(f
, FM_VERSION
, FM_HC_SCHEME_VERSION
);
2000 err
|= nvlist_add_string(f
, FM_FMRI_HC_ROOT
, "");
2001 err
|= nvlist_add_uint32(f
, FM_FMRI_HC_LIST_SZ
, depth
);
2003 failure
= "basic construction of FMRI failed";
2007 numbuf
[MAXDIGITIDX
] = '\0';
2008 nullbyte
= &numbuf
[MAXDIGITIDX
];
2011 for (nc
= n
; nc
!= NULL
; nc
= nc
->u
.name
.next
) {
2012 err
= nvlist_xalloc(&p
, NV_UNIQUE_NAME
, &Eft_nv_hdl
);
2014 failure
= "alloc of an hc-pair failed";
2017 err
= nvlist_add_string(p
, FM_FMRI_HC_NAME
, nc
->u
.name
.s
);
2018 numstr
= ulltostr(nc
->u
.name
.child
->u
.ull
, nullbyte
);
2019 err
|= nvlist_add_string(p
, FM_FMRI_HC_ID
, numstr
);
2021 failure
= "construction of an hc-pair failed";
2027 err
= nvlist_add_nvlist_array(f
, FM_FMRI_HC_LIST
, pa
, depth
);
2029 for (i
= 0; i
< depth
; i
++)
2033 failure
= "addition of hc-pair array to FMRI failed";
2036 for (i
= 0; i
< depth
; i
++)
2039 out(O_DIE
, "%s", failure
);
2044 /* an ipath cache entry is an array of these, with s==NULL at the end */
2046 const char *s
; /* component name (in stable) */
2047 int i
; /* instance number */
2051 ipath2fmri(struct ipath
*ipath
)
2053 nvlist_t
**pa
, *f
, *p
;
2055 char *numstr
, *nullbyte
;
2060 for (ipp
= ipath
; ipp
->s
!= NULL
; ipp
++)
2063 if ((err
= nvlist_xalloc(&f
, NV_UNIQUE_NAME
, &Eft_nv_hdl
)) != 0)
2064 out(O_DIE
|O_SYS
, "alloc of fmri nvl failed");
2065 pa
= alloca(depth
* sizeof (nvlist_t
*));
2066 for (i
= 0; i
< depth
; i
++)
2069 err
= nvlist_add_string(f
, FM_FMRI_SCHEME
, FM_FMRI_SCHEME_HC
);
2070 err
|= nvlist_add_uint8(f
, FM_VERSION
, FM_HC_SCHEME_VERSION
);
2071 err
|= nvlist_add_string(f
, FM_FMRI_HC_ROOT
, "");
2072 err
|= nvlist_add_uint32(f
, FM_FMRI_HC_LIST_SZ
, depth
);
2074 failure
= "basic construction of FMRI failed";
2078 numbuf
[MAXDIGITIDX
] = '\0';
2079 nullbyte
= &numbuf
[MAXDIGITIDX
];
2082 for (ipp
= ipath
; ipp
->s
!= NULL
; ipp
++) {
2083 err
= nvlist_xalloc(&p
, NV_UNIQUE_NAME
, &Eft_nv_hdl
);
2085 failure
= "alloc of an hc-pair failed";
2088 err
= nvlist_add_string(p
, FM_FMRI_HC_NAME
, ipp
->s
);
2089 numstr
= ulltostr(ipp
->i
, nullbyte
);
2090 err
|= nvlist_add_string(p
, FM_FMRI_HC_ID
, numstr
);
2092 failure
= "construction of an hc-pair failed";
2098 err
= nvlist_add_nvlist_array(f
, FM_FMRI_HC_LIST
, pa
, depth
);
2100 for (i
= 0; i
< depth
; i
++)
2104 failure
= "addition of hc-pair array to FMRI failed";
2107 for (i
= 0; i
< depth
; i
++)
2110 out(O_DIE
, "%s", failure
);
/*
 * percentof -- express "part" as a percentage of "whole", rounded to the
 * nearest integer (halves round up).
 *
 * The ratio is first computed in tenths of a percent so that the last
 * digit can be used for rounding.  Returns 0 when "whole" is 0, since no
 * meaningful percentage exists and dividing would trap.
 */
static uint_t
percentof(uint_t part, uint_t whole)
{
	unsigned long long p;
	unsigned long long tenths;

	if (whole == 0)
		return (0);

	/*
	 * Widen before scaling: multiplying two 32-bit operands would wrap
	 * for any part above 4294967 and only then be stored in p.
	 */
	p = (unsigned long long)part * 1000;
	tenths = p / whole;

	return ((uint_t)((tenths / 10) + (((tenths % 10) >= 5) ? 1 : 0)));
}
2124 struct event
*suspect
;
2130 static void publish_suspects(struct fme
*fmep
, struct rsl
*srl
);
2133 * rslfree -- free internal members of struct rsl not expected to be
2137 rslfree(struct rsl
*freeme
)
2139 nvlist_free(freeme
->asru
);
2140 nvlist_free(freeme
->fru
);
2141 if (freeme
->rsrc
!= freeme
->asru
)
2142 nvlist_free(freeme
->rsrc
);
2146 * rslcmp -- compare two rsl structures. Use the following
2147 * comparisons to establish cardinality:
2149 * 1. Name of the suspect's class. (simple strcmp)
2150 * 2. Name of the suspect's ASRU. (trickier, since nvlist)
2154 rslcmp(const void *a
, const void *b
)
2156 struct rsl
*r1
= (struct rsl
*)a
;
2157 struct rsl
*r2
= (struct rsl
*)b
;
2160 rv
= strcmp(r1
->suspect
->enode
->u
.event
.ename
->u
.name
.s
,
2161 r2
->suspect
->enode
->u
.event
.ename
->u
.name
.s
);
2165 if (r1
->rsrc
== NULL
&& r2
->rsrc
== NULL
)
2167 if (r1
->rsrc
== NULL
)
2169 if (r2
->rsrc
== NULL
)
2171 return (evnv_cmpnvl(r1
->rsrc
, r2
->rsrc
, 0));
2175 * get_resources -- for a given suspect, determine what ASRU, FRU and
2176 * RSRC nvlists should be advertised in the final suspect list.
2179 get_resources(struct event
*sp
, struct rsl
*rsrcs
, struct config
*croot
)
2181 struct node
*asrudef
, *frudef
;
2182 nvlist_t
*asru
, *fru
;
2183 nvlist_t
*rsrc
= NULL
;
2187 * First find any ASRU and/or FRU defined in the
2188 * initial fault tree.
2190 asrudef
= eventprop_lookup(sp
, L_ASRU
);
2191 frudef
= eventprop_lookup(sp
, L_FRU
);
2194 * Create FMRIs based on those definitions
2196 asru
= node2fmri(asrudef
);
2197 fru
= node2fmri(frudef
);
2198 pathstr
= ipath2str(NULL
, sp
->ipp
);
2201 * Allow for platform translations of the FMRIs
2203 platform_units_translate(is_defect(sp
->t
), croot
, &asru
, &fru
, &rsrc
,
2207 rsrcs
->suspect
= sp
;
2214 * trim_suspects -- prior to publishing, we may need to remove some
2215 * suspects from the list. If we're auto-closing upsets, we don't
2216 * want any of those in the published list. If the ASRUs for multiple
2217 * defects resolve to the same ASRU (driver) we only want to publish
2218 * that as a single suspect.
2221 trim_suspects(struct fme
*fmep
, struct rsl
*begin
, struct rsl
*begin2
,
2225 struct rsl
*rp
= begin
;
2226 struct rsl
*rp2
= begin2
;
2227 int mess_zero_count
= 0;
2231 /* remove any unwanted upsets and populate our array */
2232 for (ep
= fmep
->psuspects
; ep
; ep
= ep
->psuspects
) {
2233 if (is_upset(ep
->t
))
2235 serd_rval
= serd_eval(fmep
, fmep
->hdl
, ffep
, fmep
->fmcase
, ep
,
2239 if (node2uint(eventprop_lookup(ep
, L_message
),
2240 &messval
) == 0 && messval
== 0) {
2241 get_resources(ep
, rp2
, fmep
->config
);
2245 get_resources(ep
, rp
, fmep
->config
);
2250 return (mess_zero_count
);
2254 * addpayloadprop -- add a payload prop to a problem
2257 addpayloadprop(const char *lhs
, struct evalue
*rhs
, nvlist_t
*fault
)
2259 nvlist_t
*rsrc
, *hcs
;
2261 ASSERT(fault
!= NULL
);
2262 ASSERT(lhs
!= NULL
);
2263 ASSERT(rhs
!= NULL
);
2265 if (nvlist_lookup_nvlist(fault
, FM_FAULT_RESOURCE
, &rsrc
) != 0)
2266 out(O_DIE
, "cannot add payloadprop \"%s\" to fault", lhs
);
2268 if (nvlist_lookup_nvlist(rsrc
, FM_FMRI_HC_SPECIFIC
, &hcs
) != 0) {
2269 out(O_ALTFP
|O_VERB2
, "addpayloadprop: create hc_specific");
2270 if (nvlist_xalloc(&hcs
, NV_UNIQUE_NAME
, &Eft_nv_hdl
) != 0)
2272 "cannot add payloadprop \"%s\" to fault", lhs
);
2273 if (nvlist_add_nvlist(rsrc
, FM_FMRI_HC_SPECIFIC
, hcs
) != 0)
2275 "cannot add payloadprop \"%s\" to fault", lhs
);
2277 if (nvlist_lookup_nvlist(rsrc
, FM_FMRI_HC_SPECIFIC
, &hcs
) != 0)
2279 "cannot add payloadprop \"%s\" to fault", lhs
);
2281 out(O_ALTFP
|O_VERB2
, "addpayloadprop: reuse hc_specific");
2283 if (rhs
->t
== UINT64
) {
2284 out(O_ALTFP
|O_VERB2
, "addpayloadprop: %s=%llu", lhs
, rhs
->v
);
2286 if (nvlist_add_uint64(hcs
, lhs
, rhs
->v
) != 0)
2288 "cannot add payloadprop \"%s\" to fault", lhs
);
2290 out(O_ALTFP
|O_VERB2
, "addpayloadprop: %s=\"%s\"",
2291 lhs
, (char *)(uintptr_t)rhs
->v
);
2293 if (nvlist_add_string(hcs
, lhs
, (char *)(uintptr_t)rhs
->v
) != 0)
2295 "cannot add payloadprop \"%s\" to fault", lhs
);
2299 static char *Istatbuf
;
2300 static char *Istatbufptr
;
2304 * istataddsize -- calculate size of istat and add it to Istatsz
2308 istataddsize(const struct istat_entry
*lhs
, struct stats
*rhs
, void *arg
)
2312 ASSERT(lhs
!= NULL
);
2313 ASSERT(rhs
!= NULL
);
2315 if ((val
= stats_counter_value(rhs
)) == 0)
2316 return; /* skip zero-valued stats */
2318 /* count up the size of the stat name */
2319 Istatsz
+= ipath2strlen(lhs
->ename
, lhs
->ipath
);
2320 Istatsz
++; /* for the trailing NULL byte */
2322 /* count up the size of the stat value */
2323 Istatsz
+= snprintf(NULL
, 0, "%d", val
);
2324 Istatsz
++; /* for the trailing NULL byte */
2328 * istat2str -- serialize an istat, writing result to *Istatbufptr
2332 istat2str(const struct istat_entry
*lhs
, struct stats
*rhs
, void *arg
)
2338 ASSERT(lhs
!= NULL
);
2339 ASSERT(rhs
!= NULL
);
2341 if ((val
= stats_counter_value(rhs
)) == 0)
2342 return; /* skip zero-valued stats */
2344 /* serialize the stat name */
2345 str
= ipath2str(lhs
->ename
, lhs
->ipath
);
2348 ASSERT(Istatbufptr
+ len
+ 1 < &Istatbuf
[Istatsz
]);
2349 (void) strlcpy(Istatbufptr
, str
, &Istatbuf
[Istatsz
] - Istatbufptr
);
2352 *Istatbufptr
++ = '\0';
2354 /* serialize the stat value */
2355 Istatbufptr
+= snprintf(Istatbufptr
, &Istatbuf
[Istatsz
] - Istatbufptr
,
2357 *Istatbufptr
++ = '\0';
2359 ASSERT(Istatbufptr
<= &Istatbuf
[Istatsz
]);
2365 if (Istat_need_save
== 0)
2368 /* figure out how big the serialized info is */
2370 lut_walk(Istats
, (lut_cb
)istataddsize
, NULL
);
2373 /* no stats to save */
2374 fmd_buf_destroy(Hdl
, NULL
, WOBUF_ISTATS
);
2378 /* create the serialized buffer */
2379 Istatbufptr
= Istatbuf
= MALLOC(Istatsz
);
2380 lut_walk(Istats
, (lut_cb
)istat2str
, NULL
);
2382 /* clear out current saved stats */
2383 fmd_buf_destroy(Hdl
, NULL
, WOBUF_ISTATS
);
2385 /* write out the new version */
2386 fmd_buf_write(Hdl
, NULL
, WOBUF_ISTATS
, Istatbuf
, Istatsz
);
2389 Istat_need_save
= 0;
2393 istat_cmp(struct istat_entry
*ent1
, struct istat_entry
*ent2
)
2395 if (ent1
->ename
!= ent2
->ename
)
2396 return (ent2
->ename
- ent1
->ename
);
2397 if (ent1
->ipath
!= ent2
->ipath
)
2398 return ((char *)ent2
->ipath
- (char *)ent1
->ipath
);
2404 * istat_verify -- verify the component associated with a stat still exists
2406 * if the component no longer exists, this routine resets the stat and
2407 * returns 0. if the component still exists, it returns 1.
2410 istat_verify(struct node
*snp
, struct istat_entry
*entp
)
2412 struct stats
*statp
;
2415 fmri
= node2fmri(snp
->u
.event
.epname
);
2416 if (platform_path_exists(fmri
)) {
2422 /* component no longer in system. zero out the associated stats */
2423 if ((statp
= (struct stats
*)
2424 lut_lookup(Istats
, entp
, (lut_cmp
)istat_cmp
)) == NULL
||
2425 stats_counter_value(statp
) == 0)
2426 return (0); /* stat is already reset */
2428 Istat_need_save
= 1;
2429 stats_counter_reset(statp
);
2434 istat_bump(struct node
*snp
, int n
)
2436 struct stats
*statp
;
2437 struct istat_entry ent
;
2439 ASSERT(snp
!= NULL
);
2440 ASSERTinfo(snp
->t
== T_EVENT
, ptree_nodetype2str(snp
->t
));
2441 ASSERT(snp
->u
.event
.epname
!= NULL
);
2443 /* class name should be hoisted into a single stable entry */
2444 ASSERT(snp
->u
.event
.ename
->u
.name
.next
== NULL
);
2445 ent
.ename
= snp
->u
.event
.ename
->u
.name
.s
;
2446 ent
.ipath
= ipath(snp
->u
.event
.epname
);
2448 if (!istat_verify(snp
, &ent
)) {
2449 /* component no longer exists in system, nothing to do */
2453 if ((statp
= (struct stats
*)
2454 lut_lookup(Istats
, &ent
, (lut_cmp
)istat_cmp
)) == NULL
) {
2455 /* need to create the counter */
2460 struct istat_entry
*newentp
;
2462 /* count up the size of the stat name */
2463 np
= snp
->u
.event
.ename
;
2464 while (np
!= NULL
) {
2465 cnt
+= strlen(np
->u
.name
.s
);
2466 cnt
++; /* for the '.' or '@' */
2467 np
= np
->u
.name
.next
;
2469 np
= snp
->u
.event
.epname
;
2470 while (np
!= NULL
) {
2471 cnt
+= snprintf(NULL
, 0, "%s%llu",
2472 np
->u
.name
.s
, np
->u
.name
.child
->u
.ull
);
2473 cnt
++; /* for the '/' or trailing NULL byte */
2474 np
= np
->u
.name
.next
;
2477 /* build the stat name */
2478 snamep
= sname
= alloca(cnt
);
2479 np
= snp
->u
.event
.ename
;
2480 while (np
!= NULL
) {
2481 snamep
+= snprintf(snamep
, &sname
[cnt
] - snamep
,
2482 "%s", np
->u
.name
.s
);
2483 np
= np
->u
.name
.next
;
2488 np
= snp
->u
.event
.epname
;
2489 while (np
!= NULL
) {
2490 snamep
+= snprintf(snamep
, &sname
[cnt
] - snamep
,
2491 "%s%llu", np
->u
.name
.s
, np
->u
.name
.child
->u
.ull
);
2492 np
= np
->u
.name
.next
;
2498 /* create the new stat & add it to our list */
2499 newentp
= MALLOC(sizeof (*newentp
));
2501 statp
= stats_new_counter(NULL
, sname
, 0);
2502 Istats
= lut_add(Istats
, (void *)newentp
, (void *)statp
,
2503 (lut_cmp
)istat_cmp
);
2506 /* if n is non-zero, set that value instead of bumping */
2508 stats_counter_reset(statp
);
2509 stats_counter_add(statp
, n
);
2511 stats_counter_bump(statp
);
2512 Istat_need_save
= 1;
2514 ipath_print(O_ALTFP
|O_VERB2
, ent
.ename
, ent
.ipath
);
2515 out(O_ALTFP
|O_VERB2
, " %s to value %d", n
? "set" : "incremented",
2516 stats_counter_value(statp
));
2521 istat_destructor(void *left
, void *right
, void *arg
)
2523 struct istat_entry
*entp
= (struct istat_entry
*)left
;
2524 struct stats
*statp
= (struct stats
*)right
;
2526 stats_delete(statp
);
2530 * Callback used in a walk of the Istats to reset matching stat counters.
2533 istat_counter_reset_cb(struct istat_entry
*entp
, struct stats
*statp
,
2534 const struct ipath
*ipp
)
2538 if (entp
->ipath
== ipp
) {
2539 path
= ipath2str(entp
->ename
, ipp
);
2540 out(O_ALTFP
, "istat_counter_reset_cb: resetting %s", path
);
2542 stats_counter_reset(statp
);
2543 Istat_need_save
= 1;
2549 istat_counter_topo_chg_cb(struct istat_entry
*entp
, struct stats
*statp
,
2555 fmri
= ipath2fmri((struct ipath
*)(entp
->ipath
));
2556 if (!platform_path_exists(fmri
)) {
2557 path
= ipath2str(entp
->ename
, entp
->ipath
);
2558 out(O_ALTFP
, "istat_counter_topo_chg_cb: not present %s", path
);
2560 stats_counter_reset(statp
);
2561 Istat_need_save
= 1;
2569 lut_free(Istats
, istat_destructor
, NULL
);
2572 static char *Serdbuf
;
2573 static char *Serdbufptr
;
2577 * serdaddsize -- calculate size of serd and add it to Serdsz
2581 serdaddsize(const struct serd_entry
*lhs
, struct stats
*rhs
, void *arg
)
2583 ASSERT(lhs
!= NULL
);
2585 /* count up the size of the serd engine name */
2586 Serdsz
+= ipath2strlen(lhs
->ename
, lhs
->ipath
);
2587 Serdsz
++; /* for the trailing NULL byte */
2591 * serd2str -- serialize a serd engine, writing result to *Serdbufptr
2595 serd2str(const struct serd_entry
*lhs
, struct stats
*rhs
, void *arg
)
2600 ASSERT(lhs
!= NULL
);
2602 /* serialize the serd engine name */
2603 str
= ipath2str(lhs
->ename
, lhs
->ipath
);
2606 ASSERT(Serdbufptr
+ len
+ 1 <= &Serdbuf
[Serdsz
]);
2607 (void) strlcpy(Serdbufptr
, str
, &Serdbuf
[Serdsz
] - Serdbufptr
);
2610 *Serdbufptr
++ = '\0';
2611 ASSERT(Serdbufptr
<= &Serdbuf
[Serdsz
]);
2617 if (Serd_need_save
== 0)
2620 /* figure out how big the serialized info is */
2622 lut_walk(SerdEngines
, (lut_cb
)serdaddsize
, NULL
);
2625 /* no serd engines to save */
2626 fmd_buf_destroy(Hdl
, NULL
, WOBUF_SERDS
);
2630 /* create the serialized buffer */
2631 Serdbufptr
= Serdbuf
= MALLOC(Serdsz
);
2632 lut_walk(SerdEngines
, (lut_cb
)serd2str
, NULL
);
2634 /* clear out current saved serd engines */
2635 fmd_buf_destroy(Hdl
, NULL
, WOBUF_SERDS
);
2637 /* write out the new version */
2638 fmd_buf_write(Hdl
, NULL
, WOBUF_SERDS
, Serdbuf
, Serdsz
);
2644 serd_cmp(struct serd_entry
*ent1
, struct serd_entry
*ent2
)
2646 if (ent1
->ename
!= ent2
->ename
)
2647 return (ent2
->ename
- ent1
->ename
);
2648 if (ent1
->ipath
!= ent2
->ipath
)
2649 return ((char *)ent2
->ipath
- (char *)ent1
->ipath
);
/*
 * Reload SERD entries from the module's WOBUF_SERDS buffer.
 * The buffer is read into sbuf and scanned up to &sbuf[sz].  strchr()
 * locates the '@' in each record, and the text at sepptr is converted
 * into a path node.  Only records whose FMRI still exists according to
 * platform_path_exists() are turned into a new serd_entry and added to
 * SerdEngines, keyed and valued by the entry itself and ordered by
 * serd_cmp.
 * NOTE(review): the early exit taken when fmd_buf_size() reports 0, the
 * allocation of sbuf, and the cleanup statements are not visible here.
 */
2655 fme_serd_load(fmd_hdl_t
*hdl
)
2661 struct serd_entry
*newentp
;
2662 struct node
*epname
;
2666 if ((sz
= fmd_buf_size(hdl
, NULL
, WOBUF_SERDS
)) == 0)
2669 fmd_buf_read(hdl
, NULL
, WOBUF_SERDS
, sbuf
, sz
);
2671 while (ptr
< &sbuf
[sz
]) {
2672 sepptr
= strchr(ptr
, '@');
2678 ptr
++; /* move past the '\0' separating paths */
2679 epname
= pathstring2epnamenp(sepptr
);
2680 fmri
= node2fmri(epname
);
/* entries whose path is no longer present are not re-created */
2681 if (platform_path_exists(fmri
)) {
2682 newentp
= MALLOC(sizeof (*newentp
));
2684 newentp
->ipath
= ipath(epname
);
2685 newentp
->ename
= stable(namestring
);
2686 SerdEngines
= lut_add(SerdEngines
, (void *)newentp
,
2687 (void *)newentp
, (lut_cmp
)serd_cmp
);
2693 /* save it back again in case some of the paths no longer exist */
/*
 * Destructor callback handed to lut_free() for SerdEngines.  The left
 * (key) pointer is the serd_entry itself; right and arg are not
 * referenced by the visible code.
 * NOTE(review): the statement that releases entp is not visible here.
 */
2699 serd_destructor(void *left
, void *right
, void *arg
)
2701 struct serd_entry
*entp
= (struct serd_entry
*)left
;
2706 * Callback used in a walk of the SerdEngines to reset matching serd engines.
/*
 * Walker callback over SerdEngines: when the entry's ipath is the same
 * pointer as ipp, build the engine name with ipath2str(), log it, and
 * reset that engine through fmd_serd_reset().  The second argument is
 * unused.
 */
2710 serd_reset_cb(struct serd_entry
*entp
, void *unused
, const struct ipath
*ipp
)
/* identity comparison of the two ipath pointers, not a content compare */
2714 if (entp
->ipath
== ipp
) {
2715 path
= ipath2str(entp
->ename
, ipp
);
2716 out(O_ALTFP
, "serd_reset_cb: resetting %s", path
);
2717 fmd_serd_reset(entp
->hdl
, path
);
/*
 * Walker callback over SerdEngines used on a topology change: converts
 * the entry's ipath to an FMRI and, if platform_path_exists() reports
 * that it is no longer present, logs the engine name and resets the
 * engine with fmd_serd_reset().  Both trailing arguments are unused.
 */
2725 serd_topo_chg_cb(struct serd_entry
*entp
, void *unused
, void *unused2
)
2730 fmri
= ipath2fmri((struct ipath
*)(entp
->ipath
));
2731 if (!platform_path_exists(fmri
)) {
2732 path
= ipath2str(entp
->ename
, entp
->ipath
);
2733 out(O_ALTFP
, "serd_topo_chg_cb: not present %s", path
);
2734 fmd_serd_reset(entp
->hdl
, path
);
2744 lut_free(SerdEngines
, serd_destructor
, NULL
);
/*
 * Publish the suspect list srl (fmep->nsuspects entries) on fmep's case.
 *
 * Visible steps:
 *  - sort srl with rslcmp;
 *  - look up each suspect's FITrate property, logging when it is
 *    missing or zero ("using 1"), and record the rates in frs[];
 *  - walk the list from last to first, turning each rate into a
 *    certainty with percentof(frs[i], frsum) and creating a fault
 *    nvlist with fmd_nvl_create_fault();
 *  - copy the optional "message", "retire" and "response" properties
 *    into the fault as booleans, and add any payload properties;
 *  - evaluate an optional "action" property, add the fault to the case,
 *    and check whether the fault's ASRU already has a fault, clearing
 *    allfaulty when it has none or no ASRU is present;
 *  - walk the suspects again for "count" properties, then call
 *    istat_save().
 * NOTE(review): many statements (fault argument lists, the use made of
 * allfaulty, the count evaluation) are not visible in this excerpt.
 */
2748 publish_suspects(struct fme
*fmep
, struct rsl
*srl
)
2760 boolean_t allfaulty
= B_TRUE
;
/* erl addresses the last element of the suspect array */
2761 struct rsl
*erl
= srl
+ fmep
->nsuspects
- 1;
2766 qsort(srl
, fmep
->nsuspects
, sizeof (struct rsl
), rslcmp
);
2768 /* sum the fitrates */
2769 frs
= alloca(fmep
->nsuspects
* sizeof (uint_t
));
2770 fridx
= frcnt
= frsum
= 0;
2772 for (rp
= srl
; rp
<= erl
; rp
++) {
2775 n
= eventprop_lookup(rp
->suspect
, L_FITrate
);
2776 if (node2uint(n
, &fr
) != 0) {
2777 out(O_DEBUG
|O_NONL
, "event ");
2778 ipath_print(O_DEBUG
|O_NONL
,
2779 rp
->suspect
->enode
->u
.event
.ename
->u
.name
.s
,
2781 out(O_VERB
, " has no FITrate (using 1)");
2783 } else if (fr
== 0) {
2784 out(O_DEBUG
|O_NONL
, "event ");
2785 ipath_print(O_DEBUG
|O_NONL
,
2786 rp
->suspect
->enode
->u
.event
.ename
->u
.name
.s
,
2788 out(O_VERB
, " has zero FITrate (using 1)");
2797 /* Add them in reverse order of our sort, as fmd reverses order */
2798 for (rp
= erl
; rp
>= srl
; rp
--) {
2799 cert
= percentof(frs
[--fridx
], frsum
);
2800 fault
= fmd_nvl_create_fault(fmep
->hdl
,
2801 rp
->suspect
->enode
->u
.event
.ename
->u
.name
.s
,
2807 out(O_DIE
, "fault creation failed");
2808 /* if "message" property exists, add it to the fault */
2809 if (node2uint(eventprop_lookup(rp
->suspect
, L_message
),
2813 "[FME%d, %s adds message=%d to suspect list]",
2815 rp
->suspect
->enode
->u
.event
.ename
->u
.name
.s
,
2817 if (nvlist_add_boolean_value(fault
,
2819 (messval
) ? B_TRUE
: B_FALSE
) != 0) {
2820 out(O_DIE
, "cannot add no-message to fault");
2824 /* if "retire" property exists, add it to the fault */
2825 if (node2uint(eventprop_lookup(rp
->suspect
, L_retire
),
2829 "[FME%d, %s adds retire=%d to suspect list]",
2831 rp
->suspect
->enode
->u
.event
.ename
->u
.name
.s
,
2833 if (nvlist_add_boolean_value(fault
,
2835 (retireval
) ? B_TRUE
: B_FALSE
) != 0) {
2836 out(O_DIE
, "cannot add no-retire to fault");
2840 /* if "response" property exists, add it to the fault */
2841 if (node2uint(eventprop_lookup(rp
->suspect
, L_response
),
2842 &responseval
) == 0) {
2845 "[FME%d, %s adds response=%d to suspect list]",
2847 rp
->suspect
->enode
->u
.event
.ename
->u
.name
.s
,
2849 if (nvlist_add_boolean_value(fault
,
2850 FM_SUSPECT_RESPONSE
,
2851 (responseval
) ? B_TRUE
: B_FALSE
) != 0) {
2852 out(O_DIE
, "cannot add no-response to fault");
2856 /* add any payload properties */
2857 lut_walk(rp
->suspect
->payloadprops
,
2858 (lut_cb
)addpayloadprop
, (void *)fault
);
2862 * If "action" property exists, evaluate it; this must be done
2863 * before the allfaulty check below since some actions may
2864 * modify the asru to be used in fmd_nvl_fmri_has_fault. This
2865 * needs to be restructured if any new actions are introduced
2866 * that have effects that we do not want to be visible if
2867 * we decide not to publish in the dupclose check below.
2869 if ((snp
= eventprop_lookup(rp
->suspect
, L_action
)) != NULL
) {
2870 struct evalue evalue
;
2873 "[FME%d, %s action ", fmep
->id
,
2874 rp
->suspect
->enode
->u
.event
.ename
->u
.name
.s
);
2875 ptree_name_iter(O_ALTFP
|O_NONL
, snp
);
2878 (void) eval_expr(snp
, NULL
, NULL
, NULL
, NULL
,
2882 fmd_case_add_suspect(fmep
->hdl
, fmep
->fmcase
, fault
);
2885 * check if the asru is already marked as "faulty".
2890 out(O_ALTFP
|O_VERB
, "FME%d dup check ", fmep
->id
);
2891 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, rp
->suspect
);
2892 out(O_ALTFP
|O_VERB
|O_NONL
, " ");
2893 if (nvlist_lookup_nvlist(fault
,
2894 FM_FAULT_ASRU
, &asru
) != 0) {
2895 out(O_ALTFP
|O_VERB
, "NULL asru");
2896 allfaulty
= B_FALSE
;
2897 } else if (fmd_nvl_fmri_has_fault(fmep
->hdl
, asru
,
2898 FMD_HAS_FAULT_ASRU
, NULL
)) {
2899 out(O_ALTFP
|O_VERB
, "faulty");
2901 out(O_ALTFP
|O_VERB
, "not faulty");
2902 allfaulty
= B_FALSE
;
2910 * don't update the count stat if all asrus are already
2911 * present and unrepaired in the asru cache
2913 for (rp
= erl
; rp
>= srl
; rp
--) {
2914 struct event
*suspect
= rp
->suspect
;
2916 if (suspect
== NULL
)
2919 /* if "count" exists, increment the appropriate stat */
2920 if ((snp
= eventprop_lookup(suspect
,
2921 L_count
)) != NULL
) {
2923 "[FME%d, %s count ", fmep
->id
,
2924 suspect
->enode
->u
.event
.ename
->u
.name
.s
);
2925 ptree_name_iter(O_ALTFP
|O_NONL
, snp
);
2931 istat_save(); /* write out any istat changes */
/*
 * Map an undiagnosable-reason code (UD_VAL_*) to the defect class string
 * used when publishing the case: the visible groups return
 * UNDIAG_DEFECT_CHKPT, UNDIAG_DEFECT_FME, UNDIAG_DEFECT_LIMIT or
 * UNDIAG_DEFECT_UNKNOWN.
 * NOTE(review): some case labels are not visible in this excerpt.
 */
2936 undiag_2defect_str(int ud
)
2939 case UD_VAL_MISSINGINFO
:
2940 case UD_VAL_MISSINGOBS
:
2941 case UD_VAL_MISSINGPATH
:
2942 case UD_VAL_MISSINGZERO
:
2944 case UD_VAL_CFGMISMATCH
:
2945 return (UNDIAG_DEFECT_CHKPT
);
2947 case UD_VAL_BADEVENTI
:
2948 case UD_VAL_BADEVENTPATH
:
2949 case UD_VAL_BADEVENTCLASS
:
2950 case UD_VAL_INSTFAIL
:
2952 case UD_VAL_UNSOLVD
:
2953 return (UNDIAG_DEFECT_FME
);
2956 return (UNDIAG_DEFECT_LIMIT
);
2958 case UD_VAL_UNKNOWN
:
2960 return (UNDIAG_DEFECT_UNKNOWN
);
/*
 * Map an undiagnosable-reason code to a fault class string.  The visible
 * cases return UNDIAG_FAULT_FME.  Callers in this file test the result
 * against NULL, so other codes presumably yield NULL -- TODO confirm,
 * that path is not visible here.
 */
2965 undiag_2fault_str(int ud
)
2968 case UD_VAL_BADEVENTI
:
2969 case UD_VAL_BADEVENTPATH
:
2970 case UD_VAL_BADEVENTCLASS
:
2971 case UD_VAL_INSTFAIL
:
2973 case UD_VAL_UNSOLVD
:
2974 return (UNDIAG_FAULT_FME
);
/*
 * Build a newly MALLOC'ed, human-readable reason string for an
 * undiagnosable-reason code.  The switch selects a UD_STR_* template in
 * ptr; the template is then formatted with sprintf(), either with arg
 * or without it.
 * NOTE(review): which cases take the with-arg path is not visible here;
 * fme_undiagnosable() passes arg == NULL, so confirm that strlen(arg)
 * is only reached for codes whose callers supply a string.
 */
2981 undiag_2reason_str(int ud
, char *arg
)
2988 case UD_VAL_BADEVENTPATH
:
2989 ptr
= UD_STR_BADEVENTPATH
;
2992 case UD_VAL_BADEVENTCLASS
:
2993 ptr
= UD_STR_BADEVENTCLASS
;
2996 case UD_VAL_BADEVENTI
:
2997 ptr
= UD_STR_BADEVENTI
;
3001 ptr
= UD_STR_BADOBS
;
3003 case UD_VAL_CFGMISMATCH
:
3004 ptr
= UD_STR_CFGMISMATCH
;
3006 case UD_VAL_INSTFAIL
:
3007 ptr
= UD_STR_INSTFAIL
;
3011 ptr
= UD_STR_MAXFME
;
3013 case UD_VAL_MISSINGINFO
:
3014 ptr
= UD_STR_MISSINGINFO
;
3016 case UD_VAL_MISSINGOBS
:
3017 ptr
= UD_STR_MISSINGOBS
;
3019 case UD_VAL_MISSINGPATH
:
3020 ptr
= UD_STR_MISSINGPATH
;
3022 case UD_VAL_MISSINGZERO
:
3023 ptr
= UD_STR_MISSINGZERO
;
3026 ptr
= UD_STR_NOPATH
;
3029 case UD_VAL_UNSOLVD
:
3030 ptr
= UD_STR_UNSOLVD
;
3032 case UD_VAL_UNKNOWN
:
3034 ptr
= UD_STR_UNKNOWN
;
/*
 * The size is strlen(ptr) + strlen(arg) - 1, which presumably assumes
 * the template holds exactly one two-character conversion that arg
 * replaces (minus 2, plus 1 for the terminator) -- TODO confirm against
 * the UD_STR_* definitions.
 */
3038 buf
= MALLOC(strlen(ptr
) + strlen(arg
) - 1);
3039 (void) sprintf(buf
, ptr
, arg
);
/* no argument: the buffer is the template length plus the terminator */
3041 buf
= MALLOC(strlen(ptr
) + 1);
3042 (void) sprintf(buf
, ptr
);
/*
 * Publish an "undiagnosable" result for an ereport that could not be
 * attached to an FME.
 *
 * Visible steps: build the reason text from the global Undiag_reason and
 * arg; push a new case_list node for fmcase onto the head of
 * Undiagablecaselist; add the ereport ffep to the case; create a defect
 * suspect (certainty 50) carrying the reason with retire and response
 * set to B_FALSE; add a fault suspect as well when undiag_2fault_str()
 * returns non-NULL; solve and close the case; reset Undiag_reason to
 * UD_VAL_UNKNOWN.
 * NOTE(review): the release of reason is not visible in this excerpt.
 */
3048 publish_undiagnosable(fmd_hdl_t
*hdl
, fmd_event_t
*ffep
, fmd_case_t
*fmcase
,
3049 nvlist_t
*detector
, char *arg
)
3051 struct case_list
*newcase
;
3052 nvlist_t
*defect
, *fault
;
3053 const char *faultstr
;
3054 char *reason
= undiag_2reason_str(Undiag_reason
, arg
);
3057 "[undiagnosable ereport received, "
3058 "creating and closing a new case (%s)]", reason
);
/* link the new case at the head of the undiagnosable-case list */
3060 newcase
= MALLOC(sizeof (struct case_list
));
3061 newcase
->next
= NULL
;
3062 newcase
->fmcase
= fmcase
;
3063 if (Undiagablecaselist
!= NULL
)
3064 newcase
->next
= Undiagablecaselist
;
3065 Undiagablecaselist
= newcase
;
3068 fmd_case_add_ereport(hdl
, newcase
->fmcase
, ffep
);
/* the defect suspect is always added */
3071 defect
= fmd_nvl_create_fault(hdl
,
3072 undiag_2defect_str(Undiag_reason
), 50, NULL
, NULL
, detector
);
3073 (void) nvlist_add_string(defect
, UNDIAG_REASON
, reason
);
3074 (void) nvlist_add_boolean_value(defect
, FM_SUSPECT_RETIRE
, B_FALSE
);
3075 (void) nvlist_add_boolean_value(defect
, FM_SUSPECT_RESPONSE
, B_FALSE
);
3076 fmd_case_add_suspect(hdl
, newcase
->fmcase
, defect
);
3078 /* add fault if appropriate */
3079 faultstr
= undiag_2fault_str(Undiag_reason
);
3080 if (faultstr
!= NULL
) {
3081 fault
= fmd_nvl_create_fault(hdl
, faultstr
, 50, NULL
, NULL
,
3083 (void) nvlist_add_string(fault
, UNDIAG_REASON
, reason
);
3084 (void) nvlist_add_boolean_value(fault
, FM_SUSPECT_RETIRE
,
3086 (void) nvlist_add_boolean_value(fault
, FM_SUSPECT_RESPONSE
,
3088 fmd_case_add_suspect(hdl
, newcase
->fmcase
, fault
);
3092 /* solve and close case */
3093 fmd_case_solve(hdl
, newcase
->fmcase
);
3094 fmd_case_close(hdl
, newcase
->fmcase
);
3095 Undiag_reason
= UD_VAL_UNKNOWN
;
/*
 * Solve and close an FME's case as undiagnosable.
 *
 * Visible steps: build the reason text from the global Undiag_reason
 * (no argument); walk f->observations, adding every ereport other than
 * f->e0r to the case; derive a detector for the observation's path via
 * platform_units_translate(); add a defect suspect, and a fault suspect
 * when undiag_2fault_str() returns non-NULL, each with certainty
 * 50 / f->uniqobs; free the detector; then solve and close the case and
 * reset Undiag_reason to UD_VAL_UNKNOWN.
 * NOTE(review): the divisor f->uniqobs is not checked in the visible
 * code -- confirm it cannot be zero here.
 */
3099 fme_undiagnosable(struct fme
*f
)
3101 nvlist_t
*defect
, *fault
, *detector
= NULL
;
3104 const char *faultstr
;
3105 char *reason
= undiag_2reason_str(Undiag_reason
, NULL
);
3107 out(O_ALTFP
, "[solving/closing FME%d, case %s (%s)]",
3108 f
->id
, fmd_case_uuid(f
->hdl
, f
->fmcase
), reason
);
3110 for (ep
= f
->observations
; ep
; ep
= ep
->observations
) {
/* ereports equal to f->e0r are not added again */
3112 if (ep
->ffep
!= f
->e0r
)
3113 fmd_case_add_ereport(f
->hdl
, f
->fmcase
, ep
->ffep
);
3115 pathstr
= ipath2str(NULL
, ipath(platform_getpath(ep
->nvp
)));
3116 platform_units_translate(0, f
->config
, NULL
, NULL
, &detector
,
3121 defect
= fmd_nvl_create_fault(f
->hdl
,
3122 undiag_2defect_str(Undiag_reason
), 50 / f
->uniqobs
,
3123 NULL
, NULL
, detector
);
3124 (void) nvlist_add_string(defect
, UNDIAG_REASON
, reason
);
3125 (void) nvlist_add_boolean_value(defect
, FM_SUSPECT_RETIRE
,
3127 (void) nvlist_add_boolean_value(defect
, FM_SUSPECT_RESPONSE
,
3129 fmd_case_add_suspect(f
->hdl
, f
->fmcase
, defect
);
3131 /* add fault if appropriate */
3132 faultstr
= undiag_2fault_str(Undiag_reason
);
3133 if (faultstr
== NULL
)
3135 fault
= fmd_nvl_create_fault(f
->hdl
, faultstr
, 50 / f
->uniqobs
,
3136 NULL
, NULL
, detector
);
3137 (void) nvlist_add_string(fault
, UNDIAG_REASON
, reason
);
3138 (void) nvlist_add_boolean_value(fault
, FM_SUSPECT_RETIRE
,
3140 (void) nvlist_add_boolean_value(fault
, FM_SUSPECT_RESPONSE
,
3142 fmd_case_add_suspect(f
->hdl
, f
->fmcase
, fault
);
3143 nvlist_free(detector
);
3146 fmd_case_solve(f
->hdl
, f
->fmcase
);
3147 fmd_case_close(f
->hdl
, f
->fmcase
);
3148 Undiag_reason
= UD_VAL_UNKNOWN
;
3154 * Find the requested case amongst our fmes and close it. Free up
/*
 * Close-case handler: locate fmcase among the module's bookkeeping and
 * retire it.
 *
 * The undiagnosable case list is searched first and a matching node is
 * unlinked.  Otherwise FMElist is searched for an FME with the same hdl
 * and fmcase; a warning is logged when no FME matches.  A matching FME
 * is unlinked from FMElist (with a check against EFMElist), any timer
 * it has pending (wull != 0) is removed, and it is linked onto
 * ClosedFMEs.  Finally, when Open_fme_count is at or below Max_fme, the
 * overflow FME's case is closed if it is not closed already.
 * NOTE(review): several statements (tracking of the prev pointers,
 * count updates, frees) are not visible in this excerpt.
 */
3158 fme_close_case(fmd_hdl_t
*hdl
, fmd_case_t
*fmcase
)
3160 struct case_list
*ucasep
, *prevcasep
= NULL
;
3161 struct fme
*prev
= NULL
;
/* first pass: is this one of the undiagnosable cases? */
3164 for (ucasep
= Undiagablecaselist
; ucasep
; ucasep
= ucasep
->next
) {
3165 if (fmcase
!= ucasep
->fmcase
) {
3170 if (prevcasep
== NULL
)
3171 Undiagablecaselist
= Undiagablecaselist
->next
;
3173 prevcasep
->next
= ucasep
->next
;
/* second pass: look for the owning FME */
3179 for (fmep
= FMElist
; fmep
; fmep
= fmep
->next
) {
3180 if (fmep
->hdl
== hdl
&& fmep
->fmcase
== fmcase
)
3186 out(O_WARN
, "Eft asked to close unrecognized case [%s].",
3187 fmd_case_uuid(hdl
, fmcase
));
3191 if (EFMElist
== fmep
)
3195 FMElist
= FMElist
->next
;
3197 prev
->next
= fmep
->next
;
3201 /* Get rid of any timer this fme has set */
3202 if (fmep
->wull
!= 0)
3203 fmd_timer_remove(fmep
->hdl
, fmep
->timer
);
3205 if (ClosedFMEs
== NULL
) {
3208 fmep
->next
= ClosedFMEs
;
3214 /* See if we can close the overflow FME */
3215 if (Open_fme_count
<= Max_fme
) {
3216 for (fmep
= FMElist
; fmep
; fmep
= fmep
->next
) {
3217 if (fmep
->overflow
&& !(fmd_case_closed(fmep
->hdl
,
3223 fmd_case_close(fmep
->hdl
, fmep
->fmcase
);
3229 * If the time we need to wait for the given FME is less than the
3230 * current timer, kick that old timer out and establish a new one.
/*
 * Arrange for fmep to be re-evaluated after waiting wull.
 *
 * No timer is needed when wull is not greater than fmep->pull (that
 * long has already been waited), or when a timer is already pending
 * (fmep->wull != 0) and wull would not fire earlier than it.  Otherwise
 * any pending timer is removed and a new one is installed with
 * fmd_timer_install(), its id being stored in fmep->timer.
 * NOTE(review): the return statements and the update of fmep->wull are
 * not visible in this excerpt.
 */
3233 fme_set_timer(struct fme
*fmep
, unsigned long long wull
)
3235 out(O_ALTFP
|O_VERB
|O_NONL
, " fme_set_timer: request to wait ");
3236 ptree_timeval(O_ALTFP
|O_VERB
, &wull
);
3238 if (wull
<= fmep
->pull
) {
3239 out(O_ALTFP
|O_VERB
|O_NONL
, "already have waited at least ");
3240 ptree_timeval(O_ALTFP
|O_VERB
, &fmep
->pull
);
3241 out(O_ALTFP
|O_VERB
, NULL
);
3242 /* we've waited at least wull already, don't need timer */
/* trace whether a timer is currently pending */
3246 out(O_ALTFP
|O_VERB
|O_NONL
, " currently ");
3247 if (fmep
->wull
!= 0) {
3248 out(O_ALTFP
|O_VERB
|O_NONL
, "waiting ");
3249 ptree_timeval(O_ALTFP
|O_VERB
, &fmep
->wull
);
3250 out(O_ALTFP
|O_VERB
, NULL
);
3252 out(O_ALTFP
|O_VERB
|O_NONL
, "not waiting");
3253 out(O_ALTFP
|O_VERB
, NULL
);
3256 if (fmep
->wull
!= 0)
3257 if (wull
>= fmep
->wull
)
3258 /* New timer would fire later than established timer */
/* replace the pending timer, if any, with the earlier one */
3261 if (fmep
->wull
!= 0) {
3262 fmd_timer_remove(fmep
->hdl
, fmep
->timer
);
3265 fmep
->timer
= fmd_timer_install(fmep
->hdl
, (void *)fmep
,
3267 out(O_ALTFP
|O_VERB
, "timer set, id is %ld", fmep
->timer
);
/*
 * Timer-expiry handler for an FME.  FMElist is scanned for fmep and a
 * warning is logged when the scan ends with no match.  Otherwise the
 * wait that just elapsed is recorded (pull = wull), pull is persisted
 * to the case's WOBUF_PULL buffer, and the FME is re-evaluated with
 * fme_eval() using fmep->e0r.
 * NOTE(review): the loop body that matches ffmep against fmep and the
 * clearing of fmep->wull are not visible in this excerpt.
 */
3273 fme_timer_fired(struct fme
*fmep
, id_t tid
)
3275 struct fme
*ffmep
= NULL
;
3277 for (ffmep
= FMElist
; ffmep
; ffmep
= ffmep
->next
)
3281 if (ffmep
== NULL
) {
3282 out(O_WARN
, "Timer fired for an FME (%p) not in FMEs list.",
3287 out(O_ALTFP
|O_VERB
, "Timer fired %lx", tid
);
3288 fmep
->pull
= fmep
->wull
;
/* persist the accumulated wait with the case */
3290 fmd_buf_write(fmep
->hdl
, fmep
->fmcase
,
3291 WOBUF_PULL
, (void *)&fmep
->pull
, sizeof (fmep
->pull
));
3293 fme_eval(fmep
, fmep
->e0r
);
3297 * Preserve the fme's suspect list in its psuspects list, NULLing the
3298 * suspects list in the meantime.
3301 save_suspects(struct fme
*fmep
)
3304 struct event
*nextep
;
3306 /* zero out the previous suspect list */
3307 for (ep
= fmep
->psuspects
; ep
; ep
= nextep
) {
3308 nextep
= ep
->psuspects
;
3309 ep
->psuspects
= NULL
;
3311 fmep
->psuspects
= NULL
;
3313 /* zero out the suspect list, copying it to previous suspect list */
3314 fmep
->psuspects
= fmep
->suspects
;
3315 for (ep
= fmep
->suspects
; ep
; ep
= nextep
) {
3316 nextep
= ep
->suspects
;
3317 ep
->psuspects
= ep
->suspects
;
3318 ep
->suspects
= NULL
;
3321 fmep
->suspects
= NULL
;
3322 fmep
->nsuspects
= 0;
3326 * Retrieve the fme's suspect list from its psuspects list.
/*
 * Rebuild the FME's current suspect list from the list saved in
 * psuspects: the head is copied across and each event's suspects link
 * is set from its psuspects link.  The psuspects links themselves are
 * left in place.
 * NOTE(review): nsuspects is zeroed before the loop; the per-event
 * increment is presumably on a line not visible here -- confirm.
 */
3329 restore_suspects(struct fme
*fmep
)
3332 struct event
*nextep
;
3334 fmep
->nsuspects
= 0;
3335 fmep
->suspects
= fmep
->psuspects
;
3336 for (ep
= fmep
->psuspects
; ep
; ep
= nextep
) {
3338 nextep
= ep
->psuspects
;
3339 ep
->suspects
= ep
->psuspects
;
3344 * this is what we use to call the Emrys prototype code instead of main()
/*
 * Evaluate (or re-evaluate) an FME after a new observation or a timer.
 *
 * Visible flow:
 *  - save the current suspects, clear the arrows of the event tree, and
 *    run hypothesise() from e0 to get the new fmep->state and the delay;
 *  - log the state and the suspect list, then switch on the state;
 *  - in the branch that publishes: run upsets_eval() (returning early
 *    if that already posted suspects), split the suspects into srl and
 *    srl2 with trim_suspects(), and either close the case (no suspects
 *    in either list), publish one list, or publish srl on this case and
 *    srl2 on a newly opened parallel case; then restore the suspects,
 *    persist posted_suspects and cancel any pending timer;
 *  - in the waiting branch: set a timer for my_delay and prune the tree;
 *  - in the disproved branch: mark the reason UD_VAL_UNSOLVD and call
 *    fme_undiagnosable();
 *  - free the event tree, the config and the FME's buffers.
 * NOTE(review): the case labels, break/return statements and the frees
 * of srl/srl2/nfmep are not visible in this excerpt, so which of the
 * steps above each state reaches must be confirmed against the file.
 */
3347 fme_eval(struct fme
*fmep
, fmd_event_t
*ffep
)
3350 unsigned long long my_delay
= TIMEVAL_EVENTUALLY
;
3351 struct rsl
*srl
= NULL
;
3352 struct rsl
*srl2
= NULL
;
3353 int mess_zero_count
;
3356 save_suspects(fmep
);
3358 out(O_ALTFP
, "Evaluate FME %d", fmep
->id
);
3361 lut_walk(fmep
->eventtree
, (lut_cb
)clear_arrows
, (void *)fmep
);
3362 fmep
->state
= hypothesise(fmep
, fmep
->e0
, fmep
->ull
, &my_delay
);
3364 out(O_ALTFP
|O_NONL
, "FME%d state: %s, suspect list:", fmep
->id
,
3365 fme_state2str(fmep
->state
));
3366 for (ep
= fmep
->suspects
; ep
; ep
= ep
->suspects
) {
3367 out(O_ALTFP
|O_NONL
, " ");
3368 itree_pevent_brief(O_ALTFP
|O_NONL
, ep
);
3372 switch (fmep
->state
) {
3374 print_suspects(SLNEW
, fmep
);
3375 (void) upsets_eval(fmep
, ffep
);
3378 * we may have already posted suspects in upsets_eval() which
3379 * can recurse into fme_eval() again. If so then just return.
3381 if (fmep
->posted_suspects
)
3384 stats_counter_bump(fmep
->diags
);
3385 rpcnt
= fmep
->nsuspects
;
3386 save_suspects(fmep
);
3389 * create two lists, one for "message=1" faults and one for
3390 * "message=0" faults. If we have a mixture we will generate
3391 * two separate suspect lists.
3393 srl
= MALLOC(rpcnt
* sizeof (struct rsl
));
3394 bzero(srl
, rpcnt
* sizeof (struct rsl
));
3395 srl2
= MALLOC(rpcnt
* sizeof (struct rsl
));
3396 bzero(srl2
, rpcnt
* sizeof (struct rsl
));
3397 mess_zero_count
= trim_suspects(fmep
, srl
, srl2
, ffep
);
3400 * If the resulting suspect list has no members, we're
3401 * done so simply close the case. Otherwise sort and publish.
3403 if (fmep
->nsuspects
== 0 && mess_zero_count
== 0) {
3405 "[FME%d, case %s (all suspects are upsets)]",
3406 fmep
->id
, fmd_case_uuid(fmep
->hdl
, fmep
->fmcase
));
3407 fmd_case_close(fmep
->hdl
, fmep
->fmcase
);
3408 } else if (fmep
->nsuspects
!= 0 && mess_zero_count
== 0) {
3409 publish_suspects(fmep
, srl
);
3410 out(O_ALTFP
, "[solving FME%d, case %s]", fmep
->id
,
3411 fmd_case_uuid(fmep
->hdl
, fmep
->fmcase
));
3412 fmd_case_solve(fmep
->hdl
, fmep
->fmcase
);
3413 } else if (fmep
->nsuspects
== 0 && mess_zero_count
!= 0) {
3414 fmep
->nsuspects
= mess_zero_count
;
3415 publish_suspects(fmep
, srl2
);
3416 out(O_ALTFP
, "[solving FME%d, case %s]", fmep
->id
,
3417 fmd_case_uuid(fmep
->hdl
, fmep
->fmcase
));
3418 fmd_case_solve(fmep
->hdl
, fmep
->fmcase
);
3423 publish_suspects(fmep
, srl
);
3424 out(O_ALTFP
, "[solving FME%d, case %s]", fmep
->id
,
3425 fmd_case_uuid(fmep
->hdl
, fmep
->fmcase
));
3426 fmd_case_solve(fmep
->hdl
, fmep
->fmcase
);
3429 * Got both message=0 and message=1 so create a
3430 * duplicate case. Also need a temporary duplicate fme
3431 * structure for use by publish_suspects().
3433 nfmep
= alloc_fme();
3434 nfmep
->id
= Nextid
++;
3435 nfmep
->hdl
= fmep
->hdl
;
3436 nfmep
->nsuspects
= mess_zero_count
;
3437 nfmep
->fmcase
= fmd_case_open(fmep
->hdl
, NULL
);
3438 out(O_ALTFP
|O_STAMP
,
3439 "[creating parallel FME%d, case %s]", nfmep
->id
,
3440 fmd_case_uuid(nfmep
->hdl
, nfmep
->fmcase
));
3443 fmd_case_setprincipal(nfmep
->hdl
,
3444 nfmep
->fmcase
, ffep
);
3445 fmd_case_add_ereport(nfmep
->hdl
,
3446 nfmep
->fmcase
, ffep
);
3448 for (obsp
= fmep
->observations
; obsp
;
3449 obsp
= obsp
->observations
)
3450 if (obsp
->ffep
&& obsp
->ffep
!= ffep
)
3451 fmd_case_add_ereport(nfmep
->hdl
,
3452 nfmep
->fmcase
, obsp
->ffep
);
3454 publish_suspects(nfmep
, srl2
);
3455 out(O_ALTFP
, "[solving FME%d, case %s]", nfmep
->id
,
3456 fmd_case_uuid(nfmep
->hdl
, nfmep
->fmcase
));
3457 fmd_case_solve(nfmep
->hdl
, nfmep
->fmcase
);
3462 restore_suspects(fmep
);
3464 fmep
->posted_suspects
= 1;
3465 fmd_buf_write(fmep
->hdl
, fmep
->fmcase
,
3467 (void *)&fmep
->posted_suspects
,
3468 sizeof (fmep
->posted_suspects
));
3471 * Now the suspects have been posted, we can clear up
3472 * the instance tree as we won't be looking at it again.
3473 * Also cancel the timer as the case is now solved.
3475 if (fmep
->wull
!= 0) {
3476 fmd_timer_remove(fmep
->hdl
, fmep
->timer
);
3482 ASSERT(my_delay
> fmep
->ull
);
3483 (void) fme_set_timer(fmep
, my_delay
);
3484 print_suspects(SLWAIT
, fmep
);
3485 itree_prune(fmep
->eventtree
);
3489 print_suspects(SLDISPROVED
, fmep
);
3490 Undiag_reason
= UD_VAL_UNSOLVD
;
3491 fme_undiagnosable(fmep
);
3495 itree_free(fmep
->eventtree
);
3496 fmep
->eventtree
= NULL
;
3497 structconfig_free(fmep
->config
);
3498 fmep
->config
= NULL
;
3499 destroy_fme_bufs(fmep
);
/*
 * Forward declarations for the inference helpers defined below.  The
 * three *_test routines share one shape: the FME, the event under
 * test, the latest time by which it must be observed, and an out
 * parameter that receives the delay to wait.
 */
3502 static void indent(void);
3503 static int triggered(struct fme
*fmep
, struct event
*ep
, int mark
);
3504 static enum fme_state
effects_test(struct fme
*fmep
,
3505 struct event
*fault_event
, unsigned long long at_latest_by
,
3506 unsigned long long *pdelay
);
3507 static enum fme_state
requirements_test(struct fme
*fmep
, struct event
*ep
,
3508 unsigned long long at_latest_by
, unsigned long long *pdelay
);
3509 static enum fme_state
causes_test(struct fme
*fmep
, struct event
*ep
,
3510 unsigned long long at_latest_by
, unsigned long long *pdelay
);
/*
 * Evaluate the constraints attached to an arrow.
 *
 * A previous verdict cached in arrowp->forever_false or
 * arrowp->forever_true is traced first.  Otherwise each constraint is
 * evaluated with eval_expr() against the FME's globals and config.  A
 * successful evaluation that yields UNDEFINED or 0 marks the arrow
 * forever_false; an unsuccessful evaluation is traced as "Deferred";
 * when the loop completes the arrow is marked forever_true.
 * NOTE(review): the return statements are not visible in this excerpt;
 * mark_arrows() treats a result of 0 as "constraints fail".
 */
3513 checkconstraints(struct fme
*fmep
, struct arrow
*arrowp
)
3515 struct constraintlist
*ctp
;
3516 struct evalue value
;
/* cached verdicts */
3519 if (arrowp
->forever_false
) {
3521 out(O_ALTFP
|O_VERB
|O_NONL
, " Forever false constraint: ");
3522 for (ctp
= arrowp
->constraints
; ctp
!= NULL
; ctp
= ctp
->next
) {
3523 out(O_ALTFP
|O_VERB
|O_NONL
, sep
);
3524 ptree(O_ALTFP
|O_VERB
|O_NONL
, ctp
->cnode
, 1, 0);
3527 out(O_ALTFP
|O_VERB
, NULL
);
3530 if (arrowp
->forever_true
) {
3532 out(O_ALTFP
|O_VERB
|O_NONL
, " Forever true constraint: ");
3533 for (ctp
= arrowp
->constraints
; ctp
!= NULL
; ctp
= ctp
->next
) {
3534 out(O_ALTFP
|O_VERB
|O_NONL
, sep
);
3535 ptree(O_ALTFP
|O_VERB
|O_NONL
, ctp
->cnode
, 1, 0);
3538 out(O_ALTFP
|O_VERB
, NULL
);
/* no cached verdict: evaluate each constraint in turn */
3542 for (ctp
= arrowp
->constraints
; ctp
!= NULL
; ctp
= ctp
->next
) {
3543 if (eval_expr(ctp
->cnode
, NULL
, NULL
,
3544 &fmep
->globals
, fmep
->config
,
3545 arrowp
, 0, &value
)) {
3546 /* evaluation successful */
3547 if (value
.t
== UNDEFINED
|| value
.v
== 0) {
3549 arrowp
->forever_false
= 1;
3551 out(O_ALTFP
|O_VERB
|O_NONL
,
3552 " False constraint: ");
3553 ptree(O_ALTFP
|O_VERB
|O_NONL
, ctp
->cnode
, 1, 0);
3554 out(O_ALTFP
|O_VERB
, NULL
);
3558 /* evaluation unsuccessful -- unknown value */
3560 out(O_ALTFP
|O_VERB
|O_NONL
,
3561 " Deferred constraint: ");
3562 ptree(O_ALTFP
|O_VERB
|O_NONL
, ctp
->cnode
, 1, 0);
3563 out(O_ALTFP
|O_VERB
, NULL
);
3568 arrowp
->forever_true
= 1;
3570 out(O_ALTFP
|O_VERB
|O_NONL
, " True constraint: ");
3571 for (ctp
= arrowp
->constraints
; ctp
!= NULL
; ctp
= ctp
->next
) {
3572 out(O_ALTFP
|O_VERB
|O_NONL
, sep
);
3573 ptree(O_ALTFP
|O_VERB
|O_NONL
, ctp
->cnode
, 1, 0);
3576 out(O_ALTFP
|O_VERB
, NULL
);
/*
 * Test whether event ep has accumulated enough marked arrows.  Every
 * bubble of ep is visited; within a bubble, arrows whose mark field
 * has any of the bits in mark set are counted, and the count is
 * compared with the bubble's nork value.  Each call bumps the FME's
 * Tcallcount statistic.
 * NOTE(review): the bubble-type filter, the reset of count and the
 * return statements are not visible in this excerpt.
 */
3581 triggered(struct fme
*fmep
, struct event
*ep
, int mark
)
3584 struct arrowlist
*ap
;
3587 stats_counter_bump(fmep
->Tcallcount
);
3588 for (bp
= itree_next_bubble(ep
, NULL
); bp
;
3589 bp
= itree_next_bubble(ep
, bp
)) {
3592 for (ap
= itree_next_arrow(bp
, NULL
); ap
;
3593 ap
= itree_next_arrow(bp
, ap
)) {
3594 /* check count of marks against K in the bubble */
3595 if ((ap
->arrowp
->mark
& mark
) &&
3596 ++count
>= bp
->nork
)
/*
 * Recursively propagate an effects mark along the arrows that leave
 * event ep (only bubbles of type B_FROM are followed).
 *
 * For each arrow, ep2 is the event at the arrow's head.  One visible
 * path clears state: arrow_marked and the EFFECTS_COUNTER bit are
 * reset, the WAIT_EFFECT/CREDIBLE_EFFECT/PARENT_WAIT bits are removed
 * from ep2->cached_state (setting keep_in_tree first when keep is set
 * and any of those bits was present), and the walk recurses.  The
 * other path sets marks: the arrow is flagged, events with a cached
 * verdict are traced as "ALREADY ...", the arrow's constraints are
 * checked, the K-count is tested with triggered(), and
 * requirements_test() is run on ep2.  A FME_WAIT result records
 * WAIT_EFFECT and the shortest delay; other non-disproved results OR
 * mark into ep2->cached_state and recurse.
 * When the overall result is WAIT_EFFECT, *pdelay receives the
 * shortest delay seen.
 * NOTE(review): the condition choosing between the clearing and the
 * marking path, the continue statements and the final return are not
 * visible in this excerpt; effects_test() calls this with mark == 0
 * and pdelay == NULL when clearing.
 */
3604 mark_arrows(struct fme
*fmep
, struct event
*ep
, int mark
,
3605 unsigned long long at_latest_by
, unsigned long long *pdelay
, int keep
)
3608 struct arrowlist
*ap
;
3609 unsigned long long overall_delay
= TIMEVAL_EVENTUALLY
;
3610 unsigned long long my_delay
;
3611 enum fme_state result
;
3614 for (bp
= itree_next_bubble(ep
, NULL
); bp
;
3615 bp
= itree_next_bubble(ep
, bp
)) {
3616 if (bp
->t
!= B_FROM
)
3618 stats_counter_bump(fmep
->Marrowcount
);
3619 for (ap
= itree_next_arrow(bp
, NULL
); ap
;
3620 ap
= itree_next_arrow(bp
, ap
)) {
3621 struct event
*ep2
= ap
->arrowp
->head
->myevent
;
3623 * if we're clearing marks, we can avoid doing
3624 * all that work evaluating constraints.
3627 if (ap
->arrowp
->arrow_marked
== 0)
3629 ap
->arrowp
->arrow_marked
= 0;
3630 ap
->arrowp
->mark
&= ~EFFECTS_COUNTER
;
3631 if (keep
&& (ep2
->cached_state
&
3632 (WAIT_EFFECT
|CREDIBLE_EFFECT
|PARENT_WAIT
)))
3633 ep2
->keep_in_tree
= 1;
3634 ep2
->cached_state
&=
3635 ~(WAIT_EFFECT
|CREDIBLE_EFFECT
|PARENT_WAIT
);
3636 (void) mark_arrows(fmep
, ep2
, mark
, 0, NULL
,
3640 ap
->arrowp
->arrow_marked
= 1;
3641 if (ep2
->cached_state
& REQMNTS_DISPROVED
) {
3643 out(O_ALTFP
|O_VERB
|O_NONL
,
3644 " ALREADY DISPROVED ");
3645 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep2
);
3646 out(O_ALTFP
|O_VERB
, NULL
);
3649 if (ep2
->cached_state
& WAIT_EFFECT
) {
3651 out(O_ALTFP
|O_VERB
|O_NONL
,
3652 " ALREADY EFFECTS WAIT ");
3653 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep2
);
3654 out(O_ALTFP
|O_VERB
, NULL
);
3657 if (ep2
->cached_state
& CREDIBLE_EFFECT
) {
3659 out(O_ALTFP
|O_VERB
|O_NONL
,
3660 " ALREADY EFFECTS CREDIBLE ");
3661 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep2
);
3662 out(O_ALTFP
|O_VERB
, NULL
);
3665 if ((ep2
->cached_state
& PARENT_WAIT
) &&
3666 (mark
& PARENT_WAIT
)) {
3668 out(O_ALTFP
|O_VERB
|O_NONL
,
3669 " ALREADY PARENT EFFECTS WAIT ");
3670 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep2
);
3671 out(O_ALTFP
|O_VERB
, NULL
);
3674 platform_set_payloadnvp(ep2
->nvp
);
3675 if (checkconstraints(fmep
, ap
->arrowp
) == 0) {
3676 platform_set_payloadnvp(NULL
);
3678 out(O_ALTFP
|O_VERB
|O_NONL
,
3679 " CONSTRAINTS FAIL ");
3680 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep2
);
3681 out(O_ALTFP
|O_VERB
, NULL
);
3684 platform_set_payloadnvp(NULL
);
3685 ap
->arrowp
->mark
|= EFFECTS_COUNTER
;
3686 if (!triggered(fmep
, ep2
, EFFECTS_COUNTER
)) {
3688 out(O_ALTFP
|O_VERB
|O_NONL
,
3689 " K-COUNT NOT YET MET ");
3690 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep2
);
3691 out(O_ALTFP
|O_VERB
, NULL
);
3694 ep2
->cached_state
&= ~PARENT_WAIT
;
3696 * if we've reached an ereport and no propagation time
3697 * is specified, use the Hesitate value
3699 if (ep2
->t
== N_EREPORT
&& at_latest_by
== 0ULL &&
3700 ap
->arrowp
->maxdelay
== 0ULL) {
3701 out(O_ALTFP
|O_VERB
|O_NONL
, " default wait ");
3702 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep2
);
3703 out(O_ALTFP
|O_VERB
, NULL
);
3704 result
= requirements_test(fmep
, ep2
, Hesitate
,
3707 result
= requirements_test(fmep
, ep2
,
3708 at_latest_by
+ ap
->arrowp
->maxdelay
,
3711 if (result
== FME_WAIT
) {
3712 retval
= WAIT_EFFECT
;
3713 if (overall_delay
> my_delay
)
3714 overall_delay
= my_delay
;
3715 ep2
->cached_state
|= WAIT_EFFECT
;
3717 out(O_ALTFP
|O_VERB
|O_NONL
, " EFFECTS WAIT ");
3718 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep2
);
3719 out(O_ALTFP
|O_VERB
, NULL
);
3721 if (mark_arrows(fmep
, ep2
, PARENT_WAIT
,
3722 at_latest_by
, &my_delay
, 0) ==
3724 retval
= WAIT_EFFECT
;
3725 if (overall_delay
> my_delay
)
3726 overall_delay
= my_delay
;
3729 } else if (result
== FME_DISPROVED
) {
3731 out(O_ALTFP
|O_VERB
|O_NONL
,
3732 " EFFECTS DISPROVED ");
3733 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep2
);
3734 out(O_ALTFP
|O_VERB
, NULL
);
3736 ep2
->cached_state
|= mark
;
3738 if (mark
== CREDIBLE_EFFECT
)
3739 out(O_ALTFP
|O_VERB
|O_NONL
,
3740 " EFFECTS CREDIBLE ");
3742 out(O_ALTFP
|O_VERB
|O_NONL
,
3743 " PARENT EFFECTS WAIT ");
3744 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep2
);
3745 out(O_ALTFP
|O_VERB
, NULL
);
3747 if (mark_arrows(fmep
, ep2
, mark
, at_latest_by
,
3748 &my_delay
, 0) == WAIT_EFFECT
) {
3749 retval
= WAIT_EFFECT
;
3750 if (overall_delay
> my_delay
)
3751 overall_delay
= my_delay
;
3757 if (retval
== WAIT_EFFECT
)
3758 *pdelay
= overall_delay
;
/*
 * Decide whether fault_event can explain every observation of the FME.
 *
 * mark_arrows() first propagates CREDIBLE_EFFECT from fault_event; if
 * it reports WAIT_EFFECT the provisional result becomes FME_WAIT and
 * the delay is recorded.  Every observation is then checked: one
 * without CREDIBLE_EFFECT in its cached_state is traced as either
 * "NOT YET triggered" (it carries PARENT_WAIT or WAIT_EFFECT) or
 * "NOT triggered", and FME_DISPROVED is assigned on the latter path.
 * The marks are cleared again with mark_arrows(..., 0, 0, NULL, keep);
 * when the result is not FME_DISPROVED the fault event is flagged
 * keep_in_tree and keep is passed as 1.
 * Returns the resulting state; *pdelay is written only for FME_WAIT.
 */
3762 static enum fme_state
3763 effects_test(struct fme
*fmep
, struct event
*fault_event
,
3764 unsigned long long at_latest_by
, unsigned long long *pdelay
)
3766 struct event
*error_event
;
3767 enum fme_state return_value
= FME_CREDIBLE
;
3768 unsigned long long overall_delay
= TIMEVAL_EVENTUALLY
;
3769 unsigned long long my_delay
;
3771 stats_counter_bump(fmep
->Ecallcount
);
3774 out(O_ALTFP
|O_VERB
|O_NONL
, "->");
3775 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, fault_event
);
3776 out(O_ALTFP
|O_VERB
, NULL
);
3778 if (mark_arrows(fmep
, fault_event
, CREDIBLE_EFFECT
, at_latest_by
,
3779 &my_delay
, 0) == WAIT_EFFECT
) {
3780 return_value
= FME_WAIT
;
3781 if (overall_delay
> my_delay
)
3782 overall_delay
= my_delay
;
/* every observation has to be reachable from the fault */
3784 for (error_event
= fmep
->observations
;
3785 error_event
; error_event
= error_event
->observations
) {
3787 out(O_ALTFP
|O_VERB
|O_NONL
, " ");
3788 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, error_event
);
3789 if (!(error_event
->cached_state
& CREDIBLE_EFFECT
)) {
3790 if (error_event
->cached_state
&
3791 (PARENT_WAIT
|WAIT_EFFECT
)) {
3792 out(O_ALTFP
|O_VERB
, " NOT YET triggered");
3795 return_value
= FME_DISPROVED
;
3796 out(O_ALTFP
|O_VERB
, " NOT triggered");
3799 out(O_ALTFP
|O_VERB
, " triggered");
/* clear the marks that were set above */
3802 if (return_value
== FME_DISPROVED
) {
3803 (void) mark_arrows(fmep
, fault_event
, 0, 0, NULL
, 0);
3805 fault_event
->keep_in_tree
= 1;
3806 (void) mark_arrows(fmep
, fault_event
, 0, 0, NULL
, 1);
3810 out(O_ALTFP
|O_VERB
|O_NONL
, "<-EFFECTS %s ",
3811 fme_state2str(return_value
));
3812 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, fault_event
);
3813 out(O_ALTFP
|O_VERB
, NULL
);
3815 if (return_value
== FME_WAIT
)
3816 *pdelay
= overall_delay
;
3817 return (return_value
);
/*
 * Decide whether the requirements of event ep are met, could still be
 * met after waiting, or can never be met.
 *
 * A verdict cached in ep->cached_state (REQMNTS_CREDIBLE,
 * REQMNTS_DISPROVED or REQMNTS_WAIT) is reported first; for the WAIT
 * case *pdelay is loaded from ep->cached_delay.
 * For an ereport (N_EREPORT) with count == 0: if the FME has already
 * waited at least at_latest_by (fmep->pull) the event is disproved,
 * otherwise the result is FME_WAIT with the delay set to at_latest_by.
 * The verdict is cached and returned.
 * For any other event each B_FROM bubble is examined: the arrows'
 * constraints are checked once per bubble (setting BUBBLE_OK if any
 * arrow is not forever_false), then each arrow's head event is tested
 * recursively with at_latest_by extended by the arrow's maxdelay.  A
 * bubble whose credible + deferred + waiting events fall short of n
 * disproves ep; one whose credible + deferred events fall short makes
 * the result FME_WAIT with the shortest arrow delay; one with too few
 * credible events makes it FME_DEFERRED unless already FME_WAIT.
 * FME_WAIT and FME_CREDIBLE verdicts are cached on ep before returning.
 * NOTE(review): the assignment of n, the case labels of both switch
 * statements and several continue/break lines are not visible in this
 * excerpt.
 */
3820 static enum fme_state
3821 requirements_test(struct fme
*fmep
, struct event
*ep
,
3822 unsigned long long at_latest_by
, unsigned long long *pdelay
)
3825 int credible_events
;
3826 int deferred_events
;
3827 enum fme_state return_value
= FME_CREDIBLE
;
3828 unsigned long long overall_delay
= TIMEVAL_EVENTUALLY
;
3829 unsigned long long arrow_delay
;
3830 unsigned long long my_delay
;
3833 struct arrowlist
*ap
;
/* cached verdicts from an earlier visit to this event */
3835 if (ep
->cached_state
& REQMNTS_CREDIBLE
) {
3837 out(O_ALTFP
|O_VERB
|O_NONL
, " REQMNTS ALREADY CREDIBLE ");
3838 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
3839 out(O_ALTFP
|O_VERB
, NULL
);
3840 return (FME_CREDIBLE
);
3842 if (ep
->cached_state
& REQMNTS_DISPROVED
) {
3844 out(O_ALTFP
|O_VERB
|O_NONL
, " REQMNTS ALREADY DISPROVED ");
3845 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
3846 out(O_ALTFP
|O_VERB
, NULL
);
3847 return (FME_DISPROVED
);
3849 if (ep
->cached_state
& REQMNTS_WAIT
) {
3851 *pdelay
= ep
->cached_delay
;
3852 out(O_ALTFP
|O_VERB
|O_NONL
, " REQMNTS ALREADY WAIT ");
3853 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
3854 out(O_ALTFP
|O_VERB
|O_NONL
, ", wait for: ");
3855 ptree_timeval(O_ALTFP
|O_VERB
|O_NONL
, &at_latest_by
);
3856 out(O_ALTFP
|O_VERB
, NULL
);
3859 stats_counter_bump(fmep
->Rcallcount
);
3862 out(O_ALTFP
|O_VERB
|O_NONL
, "->");
3863 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
3864 out(O_ALTFP
|O_VERB
|O_NONL
, ", at latest by: ");
3865 ptree_timeval(O_ALTFP
|O_VERB
|O_NONL
, &at_latest_by
);
3866 out(O_ALTFP
|O_VERB
, NULL
);
/* leaf case: an ereport is judged by its observation count */
3868 if (ep
->t
== N_EREPORT
) {
3869 if (ep
->count
== 0) {
3870 if (fmep
->pull
>= at_latest_by
) {
3871 return_value
= FME_DISPROVED
;
3873 ep
->cached_delay
= *pdelay
= at_latest_by
;
3874 return_value
= FME_WAIT
;
3879 switch (return_value
) {
3881 ep
->cached_state
|= REQMNTS_CREDIBLE
;
3882 out(O_ALTFP
|O_VERB
|O_NONL
, "<-REQMNTS CREDIBLE ");
3883 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
3886 ep
->cached_state
|= REQMNTS_DISPROVED
;
3887 out(O_ALTFP
|O_VERB
|O_NONL
, "<-REQMNTS DISPROVED ");
3888 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
3891 ep
->cached_state
|= REQMNTS_WAIT
;
3892 out(O_ALTFP
|O_VERB
|O_NONL
, "<-REQMNTS WAIT ");
3893 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
3894 out(O_ALTFP
|O_VERB
|O_NONL
, " to ");
3895 ptree_timeval(O_ALTFP
|O_VERB
|O_NONL
, &at_latest_by
);
3898 out(O_DIE
, "requirements_test: unexpected fme_state");
3901 out(O_ALTFP
|O_VERB
, NULL
);
3904 return (return_value
);
3907 /* this event is not a report, descend the tree */
3908 for (bp
= itree_next_bubble(ep
, NULL
); bp
;
3909 bp
= itree_next_bubble(ep
, bp
)) {
3912 if (bp
->t
!= B_FROM
)
3917 credible_events
= 0;
3919 deferred_events
= 0;
3920 arrow_delay
= TIMEVAL_EVENTUALLY
;
3922 * n is -1 for 'A' so adjust it.
3923 * XXX just count up the arrows for now.
3927 for (ap
= itree_next_arrow(bp
, NULL
); ap
;
3928 ap
= itree_next_arrow(bp
, ap
))
3931 out(O_ALTFP
|O_VERB
, " Bubble Counted N=%d", n
);
3934 out(O_ALTFP
|O_VERB
, " Bubble N=%d", n
);
3939 if (!(bp
->mark
& (BUBBLE_ELIDED
|BUBBLE_OK
))) {
3940 for (ap
= itree_next_arrow(bp
, NULL
); ap
;
3941 ap
= itree_next_arrow(bp
, ap
)) {
3942 ep2
= ap
->arrowp
->head
->myevent
;
3943 platform_set_payloadnvp(ep2
->nvp
);
3944 (void) checkconstraints(fmep
, ap
->arrowp
);
3945 if (!ap
->arrowp
->forever_false
) {
3947 * if all arrows are invalidated by the
3948 * constraints, then we should elide the
3949 * whole bubble to be consistant with
3950 * the tree creation time behaviour
3952 bp
->mark
|= BUBBLE_OK
;
3953 platform_set_payloadnvp(NULL
);
3956 platform_set_payloadnvp(NULL
);
3959 for (ap
= itree_next_arrow(bp
, NULL
); ap
;
3960 ap
= itree_next_arrow(bp
, ap
)) {
3961 ep2
= ap
->arrowp
->head
->myevent
;
3962 if (n
<= credible_events
)
3965 ap
->arrowp
->mark
|= REQMNTS_COUNTER
;
3966 if (triggered(fmep
, ep2
, REQMNTS_COUNTER
))
3967 /* XXX adding max timevals! */
3968 switch (requirements_test(fmep
, ep2
,
3969 at_latest_by
+ ap
->arrowp
->maxdelay
,
3980 if (my_delay
< arrow_delay
)
3981 arrow_delay
= my_delay
;
3986 "Bug in requirements_test.");
3991 if (!(bp
->mark
& BUBBLE_OK
) && waiting_events
== 0) {
3992 bp
->mark
|= BUBBLE_ELIDED
;
3996 out(O_ALTFP
|O_VERB
, " Credible: %d Waiting %d",
3997 credible_events
+ deferred_events
, waiting_events
);
3998 if (credible_events
+ deferred_events
+ waiting_events
< n
) {
3999 /* Can never meet requirements */
4000 ep
->cached_state
|= REQMNTS_DISPROVED
;
4002 out(O_ALTFP
|O_VERB
|O_NONL
, "<-REQMNTS DISPROVED ");
4003 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
4004 out(O_ALTFP
|O_VERB
, NULL
);
4006 return (FME_DISPROVED
);
4008 if (credible_events
+ deferred_events
< n
) {
4009 /* will have to wait */
4010 /* wait time is shortest known */
4011 if (arrow_delay
< overall_delay
)
4012 overall_delay
= arrow_delay
;
4013 return_value
= FME_WAIT
;
4014 } else if (credible_events
< n
) {
4015 if (return_value
!= FME_WAIT
)
4016 return_value
= FME_DEFERRED
;
4021 * don't mark as FME_DEFERRED. If this event isn't reached by another
4022 * path, then this will be considered FME_CREDIBLE. But if it is
4023 * reached by a different path so the K-count is met, then might
4024 * get overridden by FME_WAIT or FME_DISPROVED.
4026 if (return_value
== FME_WAIT
) {
4027 ep
->cached_state
|= REQMNTS_WAIT
;
4028 ep
->cached_delay
= *pdelay
= overall_delay
;
4029 } else if (return_value
== FME_CREDIBLE
) {
4030 ep
->cached_state
|= REQMNTS_CREDIBLE
;
4033 out(O_ALTFP
|O_VERB
|O_NONL
, "<-REQMNTS %s ",
4034 fme_state2str(return_value
));
4035 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
4036 out(O_ALTFP
|O_VERB
, NULL
);
4038 return (return_value
);
/*
 * causes_test -- run the "causes" test for event ep.
 *
 * Walks every bubble of ep and every arrow on each bubble.  The event
 * at an arrow's tail is handed to hypothesise(); arrows whose tail
 * event has already been tested, or for which checkconstraints()
 * returns 0, are traced and apparently skipped.  The tally of credible
 * and waiting results is then compared against k, loaded from bp->nork.
 *
 *	fmep		FME being evaluated; its Ccallcount statistic is
 *			bumped on every call
 *	ep		event under test
 *	at_latest_by	passed through unchanged to hypothesise()
 *	pdelay		out: receives overall_delay when there are
 *			waiting results
 *
 * Returns FME_DISPROVED when credible_results + waiting_results < k,
 * and FME_CREDIBLE when nothing is waiting.
 *
 * NOTE(review): this extract omits several lines of the function (for
 * example the updates of credible_results/waiting_results and the
 * return that follows the "<-CAUSES WAIT" trace, presumably FME_WAIT);
 * confirm against the full source before relying on those details.
 */
4041 static enum fme_state
4042 causes_test(struct fme
*fmep
, struct event
*ep
,
4043 unsigned long long at_latest_by
, unsigned long long *pdelay
)
4045 unsigned long long overall_delay
= TIMEVAL_EVENTUALLY
;
4046 unsigned long long my_delay
;
4047 int credible_results
= 0;
4048 int waiting_results
= 0;
4049 enum fme_state fstate
;
4050 struct event
*tail_event
;
4052 struct arrowlist
*ap
;
/* account for this call in the FME's Ccallcount statistic */
4055 stats_counter_bump(fmep
->Ccallcount
);
/* verbose trace of entry: "->" followed by a brief rendering of ep */
4058 out(O_ALTFP
|O_VERB
|O_NONL
, "->");
4059 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
4060 out(O_ALTFP
|O_VERB
, NULL
);
/* visit every bubble attached to ep */
4062 for (bp
= itree_next_bubble(ep
, NULL
); bp
;
4063 bp
= itree_next_bubble(ep
, bp
)) {
4066 k
= bp
->nork
; /* remember the K value */
/* visit every arrow on this bubble */
4067 for (ap
= itree_next_arrow(bp
, NULL
); ap
;
4068 ap
= itree_next_arrow(bp
, ap
)) {
4069 int do_not_follow
= 0;
4072 * if we get to the same event multiple times
4073 * only worry about the first one.
4075 if (ap
->arrowp
->tail
->myevent
->cached_state
&
4078 out(O_ALTFP
|O_VERB
|O_NONL
,
4079 " causes test already run for ");
4080 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
,
4081 ap
->arrowp
->tail
->myevent
);
4082 out(O_ALTFP
|O_VERB
, NULL
);
4087 * see if false constraint prevents us
4088 * from traversing this arrow
/*
 * ep's nvp is installed via platform_set_payloadnvp() for the
 * duration of the constraint check and reset to NULL afterwards
 * (presumably so checkconstraints() evaluates against ep's
 * payload -- confirm against platform_set_payloadnvp()).
 */
4090 platform_set_payloadnvp(ep
->nvp
);
4091 if (checkconstraints(fmep
, ap
->arrowp
) == 0)
4093 platform_set_payloadnvp(NULL
);
/* trace arrows that the constraint check ruled out */
4094 if (do_not_follow
) {
4096 out(O_ALTFP
|O_VERB
|O_NONL
,
4097 " False arrow from ");
4098 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
,
4099 ap
->arrowp
->tail
->myevent
);
4100 out(O_ALTFP
|O_VERB
, NULL
);
/* flag the tail event in cached_state, then hypothesise about it */
4104 ap
->arrowp
->tail
->myevent
->cached_state
|=
4106 tail_event
= ap
->arrowp
->tail
->myevent
;
4107 fstate
= hypothesise(fmep
, tail_event
, at_latest_by
,
/* keep the smallest delay recorded so far */
4112 if (my_delay
< overall_delay
)
4113 overall_delay
= my_delay
;
/* internal error, reported through O_DIE */
4122 out(O_DIE
, "Bug in causes_test");
4126 /* compare against K */
/* too few credible or waiting results to reach K: disproved */
4127 if (credible_results
+ waiting_results
< k
) {
4129 out(O_ALTFP
|O_VERB
|O_NONL
, "<-CAUSES DISPROVED ");
4130 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
4131 out(O_ALTFP
|O_VERB
, NULL
);
4133 return (FME_DISPROVED
);
/* some results are still pending: hand the delay back to the caller */
4135 if (waiting_results
!= 0) {
4136 *pdelay
= overall_delay
;
4138 out(O_ALTFP
|O_VERB
|O_NONL
, "<-CAUSES WAIT ");
4139 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
4140 out(O_ALTFP
|O_VERB
|O_NONL
, " to ");
4141 ptree_timeval(O_ALTFP
|O_VERB
|O_NONL
, &at_latest_by
);
4142 out(O_ALTFP
|O_VERB
, NULL
);
/* K is met and nothing is waiting: the causes are credible */
4147 out(O_ALTFP
|O_VERB
|O_NONL
, "<-CAUSES CREDIBLE ");
4148 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
4149 out(O_ALTFP
|O_VERB
, NULL
);
4151 return (FME_CREDIBLE
);
/*
 * hypothesise -- decide whether event ep is a credible hypothesis.
 *
 * Runs requirements_test() on ep.  Unless that result is
 * FME_DISPROVED, a second test is run: effects_test() when
 * is_problem(ep->t) holds, and causes_test() (presumably as the
 * alternative for non-problem events -- the branch joining the two is
 * not visible in this extract).  A problem whose effects test is not
 * disproved is pushed onto the head of fmep->suspects, provided
 * fmep->peek is 0 and ep is not already marked as a suspect.
 *
 *	fmep		FME being evaluated; its Hcallcount statistic is
 *			bumped on every call
 *	ep		event being hypothesised about
 *	at_latest_by	passed through unchanged to the sub-tests
 *	pdelay		out: receives the smaller of the delays reported
 *			by waiting sub-tests, written only when the second
 *			test is not disproved and at least one test
 *			returned FME_WAIT
 *
 * Returns FME_DISPROVED when either test is disproved and FME_CREDIBLE
 * when neither test is disproved or waiting.
 *
 * NOTE(review): the return following the "<-WAIT" trace is not visible
 * in this extract (presumably FME_WAIT); confirm against the full
 * source.
 */
4154 static enum fme_state
4155 hypothesise(struct fme
*fmep
, struct event
*ep
,
4156 unsigned long long at_latest_by
, unsigned long long *pdelay
)
4158 enum fme_state rtr
, otr
;
4159 unsigned long long my_delay
;
4160 unsigned long long overall_delay
= TIMEVAL_EVENTUALLY
;
/* account for this call in the FME's Hcallcount statistic */
4162 stats_counter_bump(fmep
->Hcallcount
);
/* verbose trace of entry: the event and the at_latest_by deadline */
4165 out(O_ALTFP
|O_VERB
|O_NONL
, "->");
4166 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
4167 out(O_ALTFP
|O_VERB
|O_NONL
, ", at latest by: ");
4168 ptree_timeval(O_ALTFP
|O_VERB
|O_NONL
, &at_latest_by
);
4169 out(O_ALTFP
|O_VERB
, NULL
);
/* first test: requirements; a WAIT result may shorten the delay */
4171 rtr
= requirements_test(fmep
, ep
, at_latest_by
, &my_delay
);
4172 if ((rtr
== FME_WAIT
) && (my_delay
< overall_delay
))
4173 overall_delay
= my_delay
;
/* the second test only runs when requirements were not disproved */
4174 if (rtr
!= FME_DISPROVED
) {
4175 if (is_problem(ep
->t
)) {
4176 otr
= effects_test(fmep
, ep
, at_latest_by
, &my_delay
);
4177 if (otr
!= FME_DISPROVED
) {
/* link ep at the head of the FME's suspect list, once, unless peeking */
4178 if (fmep
->peek
== 0 && ep
->is_suspect
== 0) {
4179 ep
->suspects
= fmep
->suspects
;
4181 fmep
->suspects
= ep
;
4186 otr
= causes_test(fmep
, ep
, at_latest_by
, &my_delay
);
/* a WAIT from the second test may shorten the delay further */
4187 if ((otr
== FME_WAIT
) && (my_delay
< overall_delay
))
4188 overall_delay
= my_delay
;
/* report the delay only when the outcome will be a wait */
4189 if ((otr
!= FME_DISPROVED
) &&
4190 ((rtr
== FME_WAIT
) || (otr
== FME_WAIT
)))
4191 *pdelay
= overall_delay
;
/* requirements disproved: otr was never assigned, so test rtr first */
4193 if (rtr
== FME_DISPROVED
) {
4195 out(O_ALTFP
|O_VERB
|O_NONL
, "<-DISPROVED ");
4196 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
4197 out(O_ALTFP
|O_VERB
, " (doesn't meet requirements)");
4199 return (FME_DISPROVED
);
/* a problem whose effects test was disproved */
4201 if ((otr
== FME_DISPROVED
) && is_problem(ep
->t
)) {
4203 out(O_ALTFP
|O_VERB
|O_NONL
, "<-DISPROVED ");
4204 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
4205 out(O_ALTFP
|O_VERB
, " (doesn't explain all reports)");
4207 return (FME_DISPROVED
);
/* a non-problem event whose second test was disproved */
4209 if (otr
== FME_DISPROVED
) {
4211 out(O_ALTFP
|O_VERB
|O_NONL
, "<-DISPROVED ");
4212 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
4213 out(O_ALTFP
|O_VERB
, " (causes are not credible)");
4215 return (FME_DISPROVED
);
/* nothing disproved, but at least one test is still waiting */
4217 if ((rtr
== FME_WAIT
) || (otr
== FME_WAIT
)) {
4219 out(O_ALTFP
|O_VERB
|O_NONL
, "<-WAIT ");
4220 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
4221 out(O_ALTFP
|O_VERB
|O_NONL
, " to ");
4222 ptree_timeval(O_ALTFP
|O_VERB
|O_NONL
, &overall_delay
);
4223 out(O_ALTFP
|O_VERB
, NULL
);
/* neither test is disproved or waiting: the hypothesis is credible */
4228 out(O_ALTFP
|O_VERB
|O_NONL
, "<-CREDIBLE ");
4229 itree_pevent_brief(O_ALTFP
|O_VERB
|O_NONL
, ep
);
4230 out(O_ALTFP
|O_VERB
, NULL
);
4232 return (FME_CREDIBLE
);
4236 * fme_istat_load -- reconstitute any persistent istats
4239 fme_istat_load(fmd_hdl_t
*hdl
)
4245 if ((sz
= fmd_buf_size(hdl
, NULL
, WOBUF_ISTATS
)) == 0) {
4246 out(O_ALTFP
, "fme_istat_load: No stats");
4252 fmd_buf_read(hdl
, NULL
, WOBUF_ISTATS
, sbuf
, sz
);
4255 * pick apart the serialized stats
4258 * <class-name>, '@', <path>, '\0', <value>, '\0'
4260 * "stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0"
4262 * since this is parsing our own serialized data, any parsing issues
4263 * are fatal, so we check for them all with ASSERT() below.
4266 while (ptr
< &sbuf
[sz
]) {
4271 sepptr
= strchr(ptr
, '@');
4272 ASSERT(sepptr
!= NULL
);
4275 /* construct the event */
4276 np
= newnode(T_EVENT
, NULL
, 0);
4277 np
->u
.event
.ename
= newnode(T_NAME
, NULL
, 0);
4278 np
->u
.event
.ename
->u
.name
.t
= N_STAT
;
4279 np
->u
.event
.ename
->u
.name
.s
= stable(ptr
);
4280 np
->u
.event
.ename
->u
.name
.it
= IT_ENAME
;
4281 np
->u
.event
.ename
->u
.name
.last
= np
->u
.event
.ename
;
4284 ASSERT(ptr
< &sbuf
[sz
]);
4286 ptr
++; /* move past the '\0' separating path from value */
4287 ASSERT(ptr
< &sbuf
[sz
]);
4288 ASSERT(isdigit(*ptr
));
4292 ptr
++; /* move past the final '\0' for this entry */
4294 np
->u
.event
.epname
= pathstring2epnamenp(sepptr
+ 1);
4295 ASSERT(np
->u
.event
.epname
!= NULL
);
4297 istat_bump(np
, val
);