/* $NetBSD: kern_tc.c,v 1.39 2009/05/23 17:08:04 ad Exp $ */

/*-
 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
 * ---------------------------------------------------------------------------
 */
#include <sys/cdefs.h>
/* __FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.166 2005/09/19 22:16:31 andre Exp $"); */
__KERNEL_RCSID(0, "$NetBSD: kern_tc.c,v 1.39 2009/05/23 17:08:04 ad Exp $");

#include "opt_ntp.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/reboot.h>	/* XXX just to get AB_VERBOSE */
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/timepps.h>
#include <sys/timetc.h>
#include <sys/timex.h>
#include <sys/evcnt.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/xcall.h>
/*
 * A large step happens on boot.  This constant detects such steps.
 * It is relatively small so that ntp_update_second gets called enough
 * in the typical 'missed a couple of seconds' case, but doesn't loop
 * forever when the time step is large.
 */
#define LARGE_STEP	200
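
/*
 * For example: a 3 second step after a few missed timeouts makes the
 * loop in tc_windup() below call ntp_update_second() three times,
 * while a 10000 second step at boot exceeds LARGE_STEP and is clamped
 * to exactly two calls (two rather than one, in case a leap second
 * was missed during the step).
 */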
/*
 * Implement a dummy timecounter which we can use until we get a real one
 * in the air.  This allows the console and other early stuff to use
 * time services.
 */

static u_int
dummy_get_timecount(struct timecounter *tc)
{
	static u_int now;

	return (++now);
}

static struct timecounter dummy_timecounter = {
	dummy_get_timecount, 0, ~0u, 1000000, "dummy", -1000000, NULL, NULL,
};
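
/*
 * For reference, the positional initializer above fills in the leading
 * members of struct timecounter as declared in <sys/timetc.h>:
 * tc_get_timecount, tc_poll_pps, tc_counter_mask, tc_frequency,
 * tc_name, tc_quality, tc_priv and tc_next.  The -1000000 quality
 * ensures the dummy is never chosen once a real counter registers.
 */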
struct timehands {
	/* These fields must be initialized by the driver. */
	struct timecounter	*th_counter;     /* active timecounter */
	int64_t			th_adjustment;   /* frequency adjustment */
						 /* (NTP/adjtime) */
	u_int64_t		th_scale;        /* scale factor (counter */
						 /* tick->time) */
	u_int64_t		th_offset_count; /* offset at last time */
						 /* update (tc_windup()) */
	struct bintime		th_offset;       /* bin (up)time at windup */
	struct timeval		th_microtime;    /* cached microtime */
	struct timespec		th_nanotime;     /* cached nanotime */
	/* Fields not to be copied in tc_windup start with th_generation. */
	volatile u_int		th_generation;   /* current generation */
	struct timehands	*th_next;        /* next timehand */
};
static struct timehands th0;
static struct timehands th9 = { .th_next = &th0, };
static struct timehands th8 = { .th_next = &th9, };
static struct timehands th7 = { .th_next = &th8, };
static struct timehands th6 = { .th_next = &th7, };
static struct timehands th5 = { .th_next = &th6, };
static struct timehands th4 = { .th_next = &th5, };
static struct timehands th3 = { .th_next = &th4, };
static struct timehands th2 = { .th_next = &th3, };
static struct timehands th1 = { .th_next = &th2, };
static struct timehands th0 = {
	.th_counter = &dummy_timecounter,
	.th_scale = (uint64_t)-1 / 1000000,
	.th_offset = { .sec = 1, .frac = 0 },
	.th_generation = 1,
	.th_next = &th1,
};

static struct timehands *volatile timehands = &th0;
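
/*
 * th0..th9 form a ring: tc_windup() advances timehands by one slot per
 * update, so a reader still holding a pointer to an old slot has about
 * ten update intervals to notice the generation change before that
 * slot is recycled.
 */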
struct timecounter *timecounter = &dummy_timecounter;
static struct timecounter *timecounters = &dummy_timecounter;

time_t time_second = 1;
time_t time_uptime = 1;

static struct bintime timebasebin;

static int timestepwarnings;

kmutex_t timecounter_lock;
static u_int timecounter_mods;
static volatile int timecounter_removals = 1;
static u_int timecounter_bad;

#ifdef __FreeBSD__
SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW,
    &timestepwarnings, 0, "");
#endif /* __FreeBSD__ */
/*
 * sysctl helper routine for kern.timecounter.hardware
 */
static int
sysctl_kern_timecounter_hardware(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int error;
	char newname[MAX_TCNAMELEN];
	struct timecounter *newtc, *tc;

	tc = timecounter;

	strlcpy(newname, tc->tc_name, sizeof(newname));

	node = *rnode;
	node.sysctl_data = newname;
	node.sysctl_size = sizeof(newname);

	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error ||
	    newp == NULL ||
	    strncmp(newname, tc->tc_name, sizeof(newname)) == 0)
		return error;

	if (l != NULL && (error = kauth_authorize_system(l->l_cred,
	    KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS, newname,
	    NULL, NULL)) != 0)
		return (error);

	if (!cold)
		mutex_spin_enter(&timecounter_lock);
	error = EINVAL;
	for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
		if (strcmp(newname, newtc->tc_name) != 0)
			continue;
		/* Warm up new timecounter. */
		(void)newtc->tc_get_timecount(newtc);
		(void)newtc->tc_get_timecount(newtc);
		timecounter = newtc;
		error = 0;
		break;
	}
	if (!cold)
		mutex_spin_exit(&timecounter_lock);
	return error;
}
static int
sysctl_kern_timecounter_choice(SYSCTLFN_ARGS)
{
	char buf[MAX_TCNAMELEN+48];
	char *where;
	const char *spc;
	struct timecounter *tc;
	size_t needed, left, slen;
	int error, mods;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	mutex_spin_enter(&timecounter_lock);
 retry:
	spc = "";
	error = 0;
	needed = 0;
	left = *oldlenp;
	where = oldp;
	for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) {
		if (where == NULL) {
			needed += sizeof(buf);  /* be conservative */
		} else {
			slen = snprintf(buf, sizeof(buf), "%s%s(q=%d, f=%" PRId64
			    " Hz)", spc, tc->tc_name, tc->tc_quality,
			    tc->tc_frequency);
			if (left < slen + 1)
				break;
			mods = timecounter_mods;
			mutex_spin_exit(&timecounter_lock);
			error = copyout(buf, where, slen + 1);
			mutex_spin_enter(&timecounter_lock);
			if (mods != timecounter_mods) {
				goto retry;
			}
			spc = " ";
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	mutex_spin_exit(&timecounter_lock);

	*oldlenp = needed;
	return (error);
}
SYSCTL_SETUP(sysctl_timecounter_setup, "sysctl timecounter setup")
{
	const struct sysctlnode *node;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "timecounter",
		SYSCTL_DESCR("time counter information"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node != NULL) {
		sysctl_createv(clog, 0, NULL, NULL,
			CTLFLAG_PERMANENT,
			CTLTYPE_STRING, "choice",
			SYSCTL_DESCR("available counters"),
			sysctl_kern_timecounter_choice, 0, NULL, 0,
			CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);

		sysctl_createv(clog, 0, NULL, NULL,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_STRING, "hardware",
			SYSCTL_DESCR("currently active time counter"),
			sysctl_kern_timecounter_hardware, 0, NULL, MAX_TCNAMELEN,
			CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);

		sysctl_createv(clog, 0, NULL, NULL,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_INT, "timestepwarnings",
			SYSCTL_DESCR("log time steps"),
			NULL, 0, &timestepwarnings, 0,
			CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
	}
}
#ifdef TC_COUNTERS
#define TC_STATS(name)							\
static struct evcnt n##name =						\
    EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "timecounter", #name);	\
EVCNT_ATTACH_STATIC(n##name)
TC_STATS(binuptime);    TC_STATS(nanouptime);    TC_STATS(microuptime);
TC_STATS(bintime);      TC_STATS(nanotime);      TC_STATS(microtime);
TC_STATS(getbinuptime); TC_STATS(getnanouptime); TC_STATS(getmicrouptime);
TC_STATS(getbintime);   TC_STATS(getnanotime);   TC_STATS(getmicrotime);
TC_STATS(setclock);
#define	TC_COUNT(var)	var.ev_count++
#undef TC_STATS
#else
#define	TC_COUNT(var)	/* nothing */
#endif	/* TC_COUNTERS */
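
/*
 * For illustration, TC_STATS(binuptime) above expands to roughly:
 *
 *	static struct evcnt nbinuptime =
 *	    EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "timecounter",
 *	    "binuptime");
 *	EVCNT_ATTACH_STATIC(nbinuptime);
 *
 * so each time-reading function below gets an event counter that is
 * visible from userland via vmstat -e.
 */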
static void tc_windup(void);

/*
 * Return the difference between the timehands' counter value now and what
 * was when we copied it to the timehands' offset_count.
 */
static __inline u_int
tc_delta(struct timehands *th)
{
	struct timecounter *tc;

	tc = th->th_counter;
	return ((tc->tc_get_timecount(tc) -
		 th->th_offset_count) & tc->tc_counter_mask);
}
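
/*
 * The mask makes counter wraparound harmless for counters narrower
 * than the subtraction width: e.g. with a 16-bit counter (mask
 * 0xffff), an offset count of 0xfff0 and a current reading of 0x0010
 * give (0x0010 - 0xfff0) & 0xffff = 0x0020, which is correct provided
 * at most one wrap occurred since the last tc_windup().
 */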
/*
 * Functions for reading the time.  We have to loop until we are sure that
 * the timehands that we operated on was not updated under our feet.  See
 * the comment in <sys/timevar.h> for a description of these 12 functions.
 */

void
binuptime(struct bintime *bt)
{
	struct timehands *th;
	lwp_t *l;
	u_int lgen, gen;

	TC_COUNT(nbinuptime);

	/*
	 * Provide exclusion against tc_detach().
	 *
	 * We record the number of timecounter removals before accessing
	 * timecounter state.  Note that the LWP can be using multiple
	 * "generations" at once, due to interrupts (interrupted while in
	 * this function).  Hardware interrupts will borrow the interrupted
	 * LWP's l_tcgen value for this purpose, and can themselves be
	 * interrupted by higher priority interrupts.  In this case we need
	 * to ensure that the oldest generation in use is recorded.
	 *
	 * splsched() is too expensive to use, so we take care to structure
	 * this code in such a way that it is not required.  Likewise, we
	 * do not disable preemption.
	 *
	 * Memory barriers are also too expensive to use for such a
	 * performance critical function.  The good news is that we do not
	 * need memory barriers for this type of exclusion, as the thread
	 * updating timecounter_removals will issue a broadcast cross call
	 * before inspecting our l_tcgen value (this elides memory ordering
	 * issues).
	 */
	l = curlwp;
	lgen = l->l_tcgen;
	if (__predict_true(lgen == 0)) {
		l->l_tcgen = timecounter_removals;
	}
	__insn_barrier();

	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
		bintime_addx(bt, th->th_scale * tc_delta(th));
	} while (gen == 0 || gen != th->th_generation);

	__insn_barrier();
	l->l_tcgen = lgen;
}
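
/*
 * Example (sketch): a caller can time an interval with binuptime()
 * and the bintime arithmetic helpers from <sys/time.h>:
 *
 *	struct bintime bs, be;
 *
 *	binuptime(&bs);
 *	...work to be measured...
 *	binuptime(&be);
 *	bintime_sub(&be, &bs);	(be now holds the elapsed time)
 */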
void
nanouptime(struct timespec *tsp)
{
	struct bintime bt;

	TC_COUNT(nnanouptime);
	binuptime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microuptime(struct timeval *tvp)
{
	struct bintime bt;

	TC_COUNT(nmicrouptime);
	binuptime(&bt);
	bintime2timeval(&bt, tvp);
}

void
bintime(struct bintime *bt)
{

	TC_COUNT(nbintime);
	binuptime(bt);
	bintime_add(bt, &timebasebin);
}

void
nanotime(struct timespec *tsp)
{
	struct bintime bt;

	TC_COUNT(nnanotime);
	bintime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microtime(struct timeval *tvp)
{
	struct bintime bt;

	TC_COUNT(nmicrotime);
	bintime(&bt);
	bintime2timeval(&bt, tvp);
}

void
getbinuptime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetbinuptime);
	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
}

void
getnanouptime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetnanouptime);
	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timespec(&th->th_offset, tsp);
	} while (gen == 0 || gen != th->th_generation);
}

void
getmicrouptime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetmicrouptime);
	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timeval(&th->th_offset, tvp);
	} while (gen == 0 || gen != th->th_generation);
}

void
getbintime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetbintime);
	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
	bintime_add(bt, &timebasebin);
}

void
getnanotime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetnanotime);
	do {
		th = timehands;
		gen = th->th_generation;
		*tsp = th->th_nanotime;
	} while (gen == 0 || gen != th->th_generation);
}

void
getmicrotime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetmicrotime);
	do {
		th = timehands;
		gen = th->th_generation;
		*tvp = th->th_microtime;
	} while (gen == 0 || gen != th->th_generation);
}
/*
 * Initialize a new timecounter and possibly use it.
 */
void
tc_init(struct timecounter *tc)
{
	u_int u;

	u = tc->tc_frequency / tc->tc_counter_mask;
	/* XXX: We need some margin here, 10% is a guess */
	u *= 11;
	u /= 10;
	if (u > hz && tc->tc_quality >= 0) {
		tc->tc_quality = -2000;
		aprint_verbose(
		    "timecounter: Timecounter \"%s\" frequency %ju Hz",
		    tc->tc_name, (uintmax_t)tc->tc_frequency);
		aprint_verbose(" -- Insufficient hz, needs at least %u\n", u);
	} else if (tc->tc_quality >= 0 || bootverbose) {
		aprint_verbose(
		    "timecounter: Timecounter \"%s\" frequency %ju Hz "
		    "quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency,
		    tc->tc_quality);
	}

	mutex_spin_enter(&timecounter_lock);
	tc->tc_next = timecounters;
	timecounters = tc;
	timecounter_mods++;

	/*
	 * Never automatically use a timecounter with negative quality.
	 * Even though we run on the dummy counter, switching here may be
	 * worse since this timecounter may not be monotonic.
	 */
	if (tc->tc_quality >= 0 && (tc->tc_quality > timecounter->tc_quality ||
	    (tc->tc_quality == timecounter->tc_quality &&
	    tc->tc_frequency > timecounter->tc_frequency))) {
		(void)tc->tc_get_timecount(tc);
		(void)tc->tc_get_timecount(tc);
		timecounter = tc;
		tc_windup();
	}
	mutex_spin_exit(&timecounter_lock);
}
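
/*
 * Example (sketch, names illustrative): a clock driver with a
 * free-running 32-bit counter ticking at 12MHz would register
 * itself as:
 *
 *	static struct timecounter mydev_timecounter = {
 *		mydev_get_timecount, NULL, 0xffffffffu,
 *		12000000, "mydev", 100, NULL, NULL,
 *	};
 *
 *	tc_init(&mydev_timecounter);
 *
 * tc_init() then switches to the new counter automatically if its
 * quality (100 here) beats that of the current one.
 */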
/*
 * Pick a new timecounter due to the existing counter going bad.
 */
static void
tc_pick(void)
{
	struct timecounter *best, *tc;

	KASSERT(mutex_owned(&timecounter_lock));

	for (best = tc = timecounters; tc != NULL; tc = tc->tc_next) {
		if (tc->tc_quality > best->tc_quality)
			best = tc;
		else if (tc->tc_quality < best->tc_quality)
			continue;
		else if (tc->tc_frequency > best->tc_frequency)
			best = tc;
	}
	(void)best->tc_get_timecount(best);
	(void)best->tc_get_timecount(best);
	timecounter = best;
}
/*
 * A timecounter has gone bad, arrange to pick a new one at the next
 * clock tick.
 */
void
tc_gonebad(struct timecounter *tc)
{

	tc->tc_quality = -100;
	membar_producer();
	atomic_inc_uint(&timecounter_bad);
}
/*
 * Stop using a timecounter and remove it from the timecounters list.
 */
int
tc_detach(struct timecounter *target)
{
	struct timecounter *tc;
	struct timecounter **tcp = NULL;
	int removals;
	uint64_t where;
	lwp_t *l;

	/* First, find the timecounter. */
	mutex_spin_enter(&timecounter_lock);
	for (tcp = &timecounters, tc = timecounters;
	     tc != NULL;
	     tcp = &tc->tc_next, tc = tc->tc_next) {
		if (tc == target)
			break;
	}
	if (tc == NULL) {
		mutex_spin_exit(&timecounter_lock);
		return ESRCH;
	}

	/* And now, remove it. */
	*tcp = tc->tc_next;
	if (timecounter == target) {
		tc_pick();
		tc_windup();
	}
	timecounter_mods++;
	removals = timecounter_removals++;
	mutex_spin_exit(&timecounter_lock);

	/*
	 * We now have to determine if any threads in the system are still
	 * making use of this timecounter.
	 *
	 * We issue a broadcast cross call to elide memory ordering issues,
	 * then scan all LWPs in the system looking at each's timecounter
	 * generation number.  We need to see a value of zero (not actively
	 * using a timecounter) or a value greater than our removal value.
	 *
	 * We may race with threads that read `timecounter_removals' and
	 * then get preempted before updating `l_tcgen'.  This is not a
	 * problem, since it means that these threads have not yet started
	 * accessing timecounter state.  All we do need is one clean
	 * snapshot of the system where every thread appears not to be using
	 * old timecounter state.
	 */
	for (;;) {
		where = xc_broadcast(0, (xcfunc_t)nullop, NULL, NULL);
		xc_wait(where);

		mutex_enter(proc_lock);
		LIST_FOREACH(l, &alllwp, l_list) {
			if (l->l_tcgen == 0 || l->l_tcgen > removals) {
				/*
				 * Not using timecounter or old timecounter
				 * state at time of our xcall or later.
				 */
				continue;
			}
			break;
		}
		mutex_exit(proc_lock);

		/*
		 * If the timecounter is still in use, wait at least 10ms
		 * before retrying.
		 */
		if (l == NULL) {
			return 0;
		}
		(void)kpause("tcdetach", false, mstohz(10), NULL);
	}
}
/* Report the frequency of the current timecounter. */
u_int64_t
tc_getfrequency(void)
{

	return (timehands->th_counter->tc_frequency);
}
/*
 * Step our concept of UTC.  This is done by modifying our estimate of
 * when we booted.
 */
void
tc_setclock(const struct timespec *ts)
{
	struct timespec ts2;
	struct bintime bt, bt2;

	mutex_spin_enter(&timecounter_lock);
	TC_COUNT(nsetclock);
	binuptime(&bt2);
	timespec2bintime(ts, &bt);
	bintime_sub(&bt, &bt2);
	bintime_add(&bt2, &timebasebin);
	timebasebin = bt;
	tc_windup();
	mutex_spin_exit(&timecounter_lock);

	if (timestepwarnings) {
		bintime2timespec(&bt2, &ts2);
		log(LOG_INFO, "Time stepped from %lld.%09ld to %lld.%09ld\n",
		    (long long)ts2.tv_sec, ts2.tv_nsec,
		    (long long)ts->tv_sec, ts->tv_nsec);
	}
}
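
/*
 * In other words: since bintime() computes UTC as uptime + timebasebin,
 * stepping the clock to ts amounts to replacing timebasebin with
 * ts - uptime.  For example, with 100.0s of uptime and ts = 1000.0s,
 * timebasebin becomes 900.0s and all subsequent UTC reads shift
 * accordingly, while the uptime clocks are unaffected.
 */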
/*
 * Initialize the next struct timehands in the ring and make
 * it the active timehands.  Along the way we might switch to a different
 * timecounter and/or do seconds processing in NTP.  Slightly magic.
 */
static void
tc_windup(void)
{
	struct bintime bt;
	struct timehands *th, *tho;
	u_int64_t scale;
	u_int delta, ncount, ogen;
	int i, s_update;
	time_t t;

	KASSERT(mutex_owned(&timecounter_lock));

	s_update = 0;

	/*
	 * Make the next timehands a copy of the current one, but do not
	 * overwrite the generation or next pointer.  While we update
	 * the contents, the generation must be zero.  Ensure global
	 * visibility of the generation before proceeding.
	 */
	tho = timehands;
	th = tho->th_next;
	ogen = th->th_generation;
	th->th_generation = 0;
	membar_producer();
	bcopy(tho, th, offsetof(struct timehands, th_generation));

	/*
	 * Capture a timecounter delta on the current timecounter and if
	 * changing timecounters, a counter value from the new timecounter.
	 * Update the offset fields accordingly.
	 */
	delta = tc_delta(th);
	if (th->th_counter != timecounter)
		ncount = timecounter->tc_get_timecount(timecounter);
	else
		ncount = 0;
	th->th_offset_count += delta;
	bintime_addx(&th->th_offset, th->th_scale * delta);

	/*
	 * Hardware latching timecounters may not generate interrupts on
	 * PPS events, so instead we poll them.  There is a finite risk that
	 * the hardware might capture a count which is later than the one we
	 * got above, and therefore possibly in the next NTP second which might
	 * have a different rate than the current NTP second.  It doesn't
	 * matter in practice.
	 */
	if (tho->th_counter->tc_poll_pps)
		tho->th_counter->tc_poll_pps(tho->th_counter);

	/*
	 * Deal with NTP second processing.  The for loop normally
	 * iterates at most once, but in extreme situations it might
	 * keep NTP sane if timeouts are not run for several seconds.
	 * At boot, the time step can be large when the TOD hardware
	 * has been read, so on really large steps, we call
	 * ntp_update_second only twice.  We need to call it twice in
	 * case we missed a leap second.
	 * If NTP is not compiled in ntp_update_second still calculates
	 * the adjustment resulting from adjtime() calls.
	 */
	bt = th->th_offset;
	bintime_add(&bt, &timebasebin);
	i = bt.sec - tho->th_microtime.tv_sec;
	if (i > LARGE_STEP)
		i = 2;
	for (; i > 0; i--) {
		t = bt.sec;
		ntp_update_second(&th->th_adjustment, &bt.sec);
		s_update = 1;
		if (bt.sec != t)
			timebasebin.sec += bt.sec - t;
	}

	/* Update the UTC timestamps used by the get*() functions. */
	/* XXX shouldn't do this here.  Should force non-`get' versions. */
	bintime2timeval(&bt, &th->th_microtime);
	bintime2timespec(&bt, &th->th_nanotime);

	/* Now is a good time to change timecounters. */
	if (th->th_counter != timecounter) {
		th->th_counter = timecounter;
		th->th_offset_count = ncount;
		s_update = 1;
	}

	/*
	 * Recalculate the scaling factor.  We want the number of 1/2^64
	 * fractions of a second per period of the hardware counter, taking
	 * into account the th_adjustment factor which the NTP PLL/adjtime(2)
	 * processing provides us with.
	 *
	 * The th_adjustment is nanoseconds per second with 32 bit binary
	 * fraction and we want 64 bit binary fraction of second:
	 *
	 *	 x = a * 2^32 / 10^9 = a * 4.294967296
	 *
	 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int
	 * we can only multiply by about 850 without overflowing, but that
	 * leaves suitably precise fractions for multiply before divide.
	 *
	 * Divide before multiply with a fraction of 2199/512 results in a
	 * systematic undercompensation of 10PPM of th_adjustment.  On a
	 * 5000PPM adjustment this is a 0.05PPM error.  This is acceptable.
	 *
	 * We happily sacrifice the lowest of the 64 bits of our result
	 * to the goddess of code clarity.
	 */
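	/*
	 * For illustration: the exact factor is 2^32 / 10^9 =
	 * 4.294967296, while 2199 / 512 = 4.294921875, a relative error
	 * of about 1.06e-5: the roughly 10PPM-of-adjustment
	 * undercompensation mentioned above.
	 */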
	if (s_update) {
		scale = (u_int64_t)1 << 63;
		scale += (th->th_adjustment / 1024) * 2199;
		scale /= th->th_counter->tc_frequency;
		th->th_scale = scale * 2;
	}

	/*
	 * Now that the struct timehands is again consistent, set the new
	 * generation number, making sure to not make it zero.  Ensure
	 * changes are globally visible before changing.
	 */
	if (++ogen == 0)
		ogen = 1;
	membar_producer();
	th->th_generation = ogen;

	/*
	 * Go live with the new struct timehands.  Ensure changes are
	 * globally visible before changing.
	 */
	time_second = th->th_microtime.tv_sec;
	time_uptime = th->th_offset.sec;
	membar_producer();
	timehands = th;

	/*
	 * Force users of the old timehand to move on.  This is
	 * necessary for MP systems; we need to ensure that the
	 * consumers will move away from the old timehand before
	 * we begin updating it again when we eventually wrap
	 * around.
	 */
	if (++tho->th_generation == 0)
		tho->th_generation = 1;
}
/*
 * RFC 2783 PPS-API implementation.
 */
int
pps_ioctl(u_long cmd, void *data, struct pps_state *pps)
{
	pps_params_t *app;
	pps_info_t *pipi;
#ifdef PPS_SYNC
	int *epi;
#endif

	KASSERT(mutex_owned(&timecounter_lock));

	KASSERT(pps != NULL);	/* XXX ("NULL pps pointer in pps_ioctl") */
	switch (cmd) {
	case PPS_IOC_CREATE:
		return (0);
	case PPS_IOC_DESTROY:
		return (0);
	case PPS_IOC_SETPARAMS:
		app = (pps_params_t *)data;
		if (app->mode & ~pps->ppscap)
			return (EINVAL);
		pps->ppsparam = *app;
		return (0);
	case PPS_IOC_GETPARAMS:
		app = (pps_params_t *)data;
		*app = pps->ppsparam;
		app->api_version = PPS_API_VERS_1;
		return (0);
	case PPS_IOC_GETCAP:
		*(int*)data = pps->ppscap;
		return (0);
	case PPS_IOC_FETCH:
		pipi = (pps_info_t *)data;
		pps->ppsinfo.current_mode = pps->ppsparam.mode;
		*pipi = pps->ppsinfo;
		return (0);
	case PPS_IOC_KCBIND:
#ifdef PPS_SYNC
		epi = (int *)data;
		/* XXX Only root should be able to do this */
		if (*epi & ~pps->ppscap)
			return (EINVAL);
		pps->kcmode = *epi;
		return (0);
#else
		return (EOPNOTSUPP);
#endif
	default:
		return (EPASSTHROUGH);
	}
}
void
pps_init(struct pps_state *pps)
{

	KASSERT(mutex_owned(&timecounter_lock));

	pps->ppscap |= PPS_TSFMT_TSPEC;
	if (pps->ppscap & PPS_CAPTUREASSERT)
		pps->ppscap |= PPS_OFFSETASSERT;
	if (pps->ppscap & PPS_CAPTURECLEAR)
		pps->ppscap |= PPS_OFFSETCLEAR;
}
void
pps_capture(struct pps_state *pps)
{
	struct timehands *th;

	KASSERT(mutex_owned(&timecounter_lock));
	KASSERT(pps != NULL);

	th = timehands;
	pps->capgen = th->th_generation;
	pps->capth = th;
	pps->capcount = (u_int64_t)tc_delta(th) + th->th_offset_count;
	if (pps->capgen != th->th_generation)
		pps->capgen = 0;
}
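
/*
 * Usage sketch (driver names illustrative): a device interrupt handler
 * typically latches the counter as early as possible and then
 * processes the event, holding timecounter_lock as the KASSERTs here
 * require:
 *
 *	mutex_spin_enter(&timecounter_lock);
 *	pps_capture(&sc->sc_pps_state);
 *	pps_event(&sc->sc_pps_state, PPS_CAPTUREASSERT);
 *	mutex_spin_exit(&timecounter_lock);
 */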
void
pps_event(struct pps_state *pps, int event)
{
	struct bintime bt;
	struct timespec ts, *tsp, *osp;
	u_int64_t tcount, *pcount;
	int foff, fhard;
	pps_seq_t *pseq;

	KASSERT(mutex_owned(&timecounter_lock));

	KASSERT(pps != NULL);	/* XXX ("NULL pps pointer in pps_event") */
	/* If the timecounter was wound up underneath us, bail out. */
	if (pps->capgen == 0 || pps->capgen != pps->capth->th_generation)
		return;

	/* Things would be easier with arrays. */
	if (event == PPS_CAPTUREASSERT) {
		tsp = &pps->ppsinfo.assert_timestamp;
		osp = &pps->ppsparam.assert_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
		fhard = pps->kcmode & PPS_CAPTUREASSERT;
		pcount = &pps->ppscount[0];
		pseq = &pps->ppsinfo.assert_sequence;
	} else {
		tsp = &pps->ppsinfo.clear_timestamp;
		osp = &pps->ppsparam.clear_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
		fhard = pps->kcmode & PPS_CAPTURECLEAR;
		pcount = &pps->ppscount[1];
		pseq = &pps->ppsinfo.clear_sequence;
	}

	/*
	 * If the timecounter changed, we cannot compare the count values, so
	 * we have to drop the rest of the PPS-stuff until the next event.
	 */
	if (pps->ppstc != pps->capth->th_counter) {
		pps->ppstc = pps->capth->th_counter;
		*pcount = pps->capcount;
		pps->ppscount[2] = pps->capcount;
		return;
	}

	/* Convert the count to a timespec. */
	tcount = pps->capcount - pps->capth->th_offset_count;
	bt = pps->capth->th_offset;
	bintime_addx(&bt, pps->capth->th_scale * tcount);
	bintime_add(&bt, &timebasebin);
	bintime2timespec(&bt, &ts);

	/* If the timecounter was wound up underneath us, bail out. */
	if (pps->capgen != pps->capth->th_generation)
		return;

	*pcount = pps->capcount;
	(*pseq)++;
	*tsp = ts;

	if (foff) {
		timespecadd(tsp, osp, tsp);
		if (tsp->tv_nsec < 0) {
			tsp->tv_nsec += 1000000000;
			tsp->tv_sec -= 1;
		}
	}
#ifdef PPS_SYNC
	if (fhard) {
		u_int64_t scale;

		/*
		 * Feed the NTP PLL/FLL.
		 * The FLL wants to know how many (hardware) nanoseconds
		 * elapsed since the previous event.
		 */
		tcount = pps->capcount - pps->ppscount[2];
		pps->ppscount[2] = pps->capcount;
		tcount &= pps->capth->th_counter->tc_counter_mask;
		scale = (u_int64_t)1 << 63;
		scale /= pps->capth->th_counter->tc_frequency;
		scale *= 2;
		bt.sec = 0;
		bt.frac = 0;
		bintime_addx(&bt, scale * tcount);
		bintime2timespec(&bt, &ts);
		hardpps(tsp, ts.tv_nsec + 1000000000 * ts.tv_sec);
	}
#endif
}
/*
 * Timecounters need to be updated every so often to prevent the hardware
 * counter from overflowing.  Updating also recalculates the cached values
 * used by the get*() family of functions, so their precision depends on
 * the update frequency.
 */

static int tc_tick;

void
tc_ticktock(void)
{
	static int count;

	if (++count < tc_tick)
		return;
	count = 0;
	mutex_spin_enter(&timecounter_lock);
	if (timecounter_bad != 0) {
		/* An existing timecounter has gone bad, pick a new one. */
		(void)atomic_swap_uint(&timecounter_bad, 0);
		if (timecounter->tc_quality < 0) {
			tc_pick();
		}
	}
	tc_windup();
	mutex_spin_exit(&timecounter_lock);
}
void
inittimecounter(void)
{
	u_int p;

	mutex_init(&timecounter_lock, MUTEX_DEFAULT, IPL_HIGH);

	/*
	 * Set the initial timeout to
	 * max(1, <approx. number of hardclock ticks in a millisecond>).
	 * People should probably not use the sysctl to set the timeout
	 * to smaller than its initial value, since that value is the
	 * smallest reasonable one.  If they want better timestamps they
	 * should use the non-"get"* functions.
	 */
	if (hz > 1000)
		tc_tick = (hz + 500) / 1000;
	else
		tc_tick = 1;
	p = (tc_tick * 1000000) / hz;
	aprint_verbose("timecounter: Timecounters tick every %d.%03u msec\n",
	    p / 1000, p % 1000);
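
	/*
	 * For example, with hz = 100: tc_tick = 1 and p = 10000, so the
	 * message above reads "tick every 10.000 msec".  With hz = 8000:
	 * tc_tick = (8000 + 500) / 1000 = 8 and p = 1000, keeping the
	 * update interval near one millisecond.
	 */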
	/* warm up new timecounter (again) and get rolling. */
	(void)timecounter->tc_get_timecount(timecounter);
	(void)timecounter->tc_get_timecount(timecounter);
}