usr/src/uts/i86pc/os/lgrpplat.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 /*
  26  * Copyright (c) 2010, Intel Corporation.
  27  * All rights reserved.
  28  */
  29
  30 /*
  31  * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS
  32  * ================================================================
  33  * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access
  34  * (NUMA).  A NUMA machine consists of one or more "nodes" that each consist of
  35  * one or more CPUs and some local memory.  The CPUs in each node can access
  36  * the memory in the other nodes but at a higher latency than accessing their
  37  * local memory.  Typically, a system with only one node has Uniform Memory
  38  * Access (UMA), but it may be possible to have a one node system that has
  39  * some global memory outside of the node which is higher latency.
  40  *
  41  * Module Description
  42  * ------------------
  43  * This module provides a platform interface for determining which CPUs and
  44  * which memory (and how much) are in a NUMA node and how far each node is from
  45  * each other.  The interface is used by the Virtual Memory (VM) system and the
  46  * common lgroup framework.  The VM system uses the plat_*() routines to fill
  47  * in its memory node (memnode) array with the physical address range spanned
  48  * by each NUMA node to know which memory belongs to which node, so it can
  49  * build and manage a physical page free list for each NUMA node and allocate
  50  * local memory from each node as needed.  The common lgroup framework uses the
  51  * exported lgrp_plat_*() routines to figure out which CPUs and memory belong
  52  * to each node (leaf lgroup) and how far each node is from each other, so it
  53  * can build the latency (lgroup) topology for the machine in order to optimize
  54  * for locality.  Also, an lgroup platform handle instead of lgroups are used
  55  * in the interface with this module, so this module shouldn't need to know
  56  * anything about lgroups.  Instead, it just needs to know which CPUs, memory,
  57  * etc. are in each NUMA node, how far each node is from each other, and to use
  58  * a unique lgroup platform handle to refer to each node through the interface.
  59  *
  60  * Determining NUMA Configuration
  61  * ------------------------------
  62  * By default, this module will try to determine the NUMA configuration of the
  63  * machine by reading the ACPI System Resource Affinity Table (SRAT) and System
  64  * Locality Information Table (SLIT).  The SRAT contains info to tell which
  65  * CPUs and memory are local to a given proximity domain (NUMA node).  The SLIT
  66  * is a matrix that gives the distance between each system locality (which is
  67  * a NUMA node and should correspond to proximity domains in the SRAT).  For
  68  * more details on the SRAT and SLIT, please refer to an ACPI 3.0 or newer
  69  * specification.
  70  *
  71  * If the SRAT doesn't exist on a system with AMD Opteron processors, we
  72  * examine registers in PCI configuration space to determine how many nodes are
  73  * in the system and which CPUs and memory are in each node.
  74  * do while booting the kernel.
  75  *
  76  * NOTE: Using these PCI configuration space registers to determine this
  77  *       locality info is not guaranteed to work or be compatible across all
  78  *       Opteron processor families.
  79  *
  80  * If the SLIT does not exist or look right, the kernel will probe to determine
  81  * the distance between nodes as long as the NUMA CPU and memory configuration
  82  * has been determined (see lgrp_plat_probe() for details).
  83  *
  84  * Data Structures
  85  * ---------------
  86  * The main data structures used by this code are the following:
  87  *
  88  * - lgrp_plat_cpu_node[]               CPU to node ID mapping table indexed by
  89  *                                      CPU ID (only used for SRAT)
  90  *
  91  * - lgrp_plat_lat_stats.latencies[][]  Table of latencies between same and
  92  *                                      different nodes indexed by node ID
  93  *
  94  * - lgrp_plat_node_cnt                 Number of NUMA nodes in system for
  95  *                                      non-DR-capable systems,
  96  *                                      maximum possible number of NUMA nodes
  97  *                                      in system for DR capable systems.
  98  *
  99  * - lgrp_plat_node_domain[]            Node ID to proximity domain ID mapping
 100  *                                      table indexed by node ID (only used
 101  *                                      for SRAT)
 102  *
 103  * - lgrp_plat_memnode_info[]           Table with physical address range for
 104  *                                      each memory node indexed by memory node
 105  *                                      ID
 106  *
 107  * The code is implemented to make the following always be true:
 108  *
 109  *      lgroup platform handle == node ID == memnode ID
 110  *
 111  * Moreover, it allows for the proximity domain ID to be equal to all of the
 112  * above as long as the proximity domains IDs are numbered from 0 to <number of
 113  * nodes - 1>.  This is done by hashing each proximity domain ID into the range
 114  * from 0 to <number of nodes - 1>.  Then proximity ID N will hash into node ID
 115  * N and proximity domain ID N will be entered into lgrp_plat_node_domain[N]
 116  * and be assigned node ID N.  If the proximity domain IDs aren't numbered
 117  * from 0 to <number of nodes - 1>, then hashing the proximity domain IDs into
 118  * lgrp_plat_node_domain[] will still work for assigning proximity domain IDs
 119  * to node IDs.  However, the proximity domain IDs may not map to the
 120  * equivalent node ID since we want to keep the node IDs numbered from 0 to
 121  * <number of nodes - 1> to minimize cost of searching and potentially space.
 122  *
 123  * With the introduction of support of memory DR operations on x86 platforms,
 124  * things get a little complicated. The addresses of hot-added memory may not
 125  * be continuous with other memory connected to the same lgrp node. In other
 126  * words, memory addresses may get interleaved among lgrp nodes after memory
 127  * DR operations. To work around this limitation, we have extended the
 128  * relationship between lgrp node and memory node from 1:1 map to 1:N map,
 129  * that means there may be multiple memory nodes associated with a lgrp node
 130  * after memory DR operations.
 131  *
 132  * To minimize the code changes to support memory DR operations, the
 133  * following policies have been adopted.
 134  * 1) On non-DR-capable systems, the relationship among lgroup platform handle,
 135  *    node ID and memnode ID is still kept as:
 136  *      lgroup platform handle == node ID == memnode ID
 137  * 2) For memory present at boot time on DR capable platforms, the relationship
 138  *    is still kept as is.
 139  *      lgroup platform handle == node ID == memnode ID
 140  * 3) For hot-added memory, the relationship between lgrp ID and memnode ID have
 141  *    been changed from 1:1 map to 1:N map. Memnode IDs [0 - lgrp_plat_node_cnt)
 142  *    are reserved for memory present at boot time, and memnode IDs
 143  *    [lgrp_plat_node_cnt, max_mem_nodes) are used to dynamically allocate
 144  *    memnode ID for hot-added memory.
 145  * 4) All boot code having the assumption "node ID == memnode ID" can live as
 146  *    is, that's because node ID is always equal to memnode ID at boot time.
 147  * 5) The lgrp_plat_memnode_info_update(), plat_pfn_to_mem_node() and
 148  *    lgrp_plat_mem_size() related logics have been enhanced to deal with
 149  *    the 1:N map relationship.
 150  * 6) The latency probing related logics, which have the assumption
 151  *    "node ID == memnode ID" and may be called at run time, is disabled if
 152  *    memory DR operation is enabled.
 153  */
 154
 155
 156 #include <sys/archsystm.h>      /* for {in,out}{b,w,l}() */
 157 #include <sys/atomic.h>
 158 #include <sys/bootconf.h>
 159 #include <sys/cmn_err.h>
 160 #include <sys/controlregs.h>
 161 #include <sys/cpupart.h>
 162 #include <sys/cpuvar.h>
 163 #include <sys/lgrp.h>
 164 #include <sys/machsystm.h>
 165 #include <sys/memlist.h>
 166 #include <sys/memnode.h>
 167 #include <sys/mman.h>
 168 #include <sys/note.h>
 169 #include <sys/pci_cfgspace.h>
 170 #include <sys/pci_impl.h>
 171 #include <sys/param.h>
 172 #include <sys/pghw.h>
 173 #include <sys/promif.h>         /* for prom_printf() */
 174 #include <sys/sysmacros.h>
 175 #include <sys/systm.h>
 176 #include <sys/thread.h>
 177 #include <sys/types.h>
 178 #include <sys/var.h>
 179 #include <sys/x86_archext.h>
 180 #include <vm/hat_i86.h>
 181 #include <vm/seg_kmem.h>
 182 #include <vm/vm_dep.h>
 183
 184 #include <sys/acpidev.h>
 185 #include <sys/acpi/acpi.h>              /* for SRAT, SLIT and MSCT */
 186
 187 /* from fakebop.c */
 188 extern ACPI_TABLE_SRAT *srat_ptr;
 189 extern ACPI_TABLE_SLIT *slit_ptr;
 190 extern ACPI_TABLE_MSCT *msct_ptr;
 191
 192 #define MAX_NODES               8
 193 #define NLGRP                   (MAX_NODES * (MAX_NODES - 1) + 1)
 194
 195 /*
 196  * Constants for configuring probing
 197  */
 198 #define LGRP_PLAT_PROBE_NROUNDS         64      /* default laps for probing */
 199 #define LGRP_PLAT_PROBE_NSAMPLES        1       /* default samples to take */
 200 #define LGRP_PLAT_PROBE_NREADS          256     /* number of vendor ID reads */
 201
 202 /*
 203  * Flags for probing
 204  */
 205 #define LGRP_PLAT_PROBE_ENABLE          0x1     /* enable probing */
 206 #define LGRP_PLAT_PROBE_PGCPY           0x2     /* probe using page copy */
 207 #define LGRP_PLAT_PROBE_VENDOR          0x4     /* probe vendor ID register */
 208
 209 /*
 210  * Hash proximity domain ID into node to domain mapping table "mod" number of
 211  * nodes to minimize span of entries used and try to have lowest numbered
 212  * proximity domain be node 0
 213  */
 214 #define NODE_DOMAIN_HASH(domain, node_cnt) \
 215         ((lgrp_plat_prox_domain_min == UINT32_MAX) ? (domain) % node_cnt : \
 216             ((domain) - lgrp_plat_prox_domain_min) % node_cnt)
 217
 218 /*
 219  * CPU to node ID mapping structure (only used with SRAT)
 220  */
 221 typedef struct cpu_node_map {
 222         int             exists;
 223         uint_t          node;
 224         uint32_t        apicid;
 225         uint32_t        prox_domain;
 226 } cpu_node_map_t;
 227
 228 /*
 229  * Latency statistics
 230  */
 231 typedef struct lgrp_plat_latency_stats {
 232         hrtime_t        latencies[MAX_NODES][MAX_NODES];
 233         hrtime_t        latency_max;
 234         hrtime_t        latency_min;
 235 } lgrp_plat_latency_stats_t;
 236
 237 /*
 238  * Memory configuration for probing
 239  */
 240 typedef struct lgrp_plat_probe_mem_config {
 241         size_t  probe_memsize;          /* how much memory to probe per node */
 242         caddr_t probe_va[MAX_NODES];    /* where memory mapped for probing */
 243         pfn_t   probe_pfn[MAX_NODES];   /* physical pages to map for probing */
 244 } lgrp_plat_probe_mem_config_t;
 245
 246 /*
 247  * Statistics kept for probing
 248  */
 249 typedef struct lgrp_plat_probe_stats {
 250         hrtime_t        flush_cost;
 251         hrtime_t        probe_cost;
 252         hrtime_t        probe_cost_total;
 253         hrtime_t        probe_error_code;
 254         hrtime_t        probe_errors[MAX_NODES][MAX_NODES];
 255         int             probe_suspect[MAX_NODES][MAX_NODES];
 256         hrtime_t        probe_max[MAX_NODES][MAX_NODES];
 257         hrtime_t        probe_min[MAX_NODES][MAX_NODES];
 258 } lgrp_plat_probe_stats_t;
 259
 260 /*
 261  * Node to proximity domain ID mapping structure (only used with SRAT)
 262  */
 263 typedef struct node_domain_map {
 264         int             exists;
 265         uint32_t        prox_domain;
 266 } node_domain_map_t;
 267
 268 /*
 269  * Node ID and starting and ending page for physical memory in memory node
 270  */
 271 typedef struct memnode_phys_addr_map {
 272         pfn_t           start;
 273         pfn_t           end;
 274         int             exists;
 275         uint32_t        prox_domain;
 276         uint32_t        device_id;
 277         uint_t          lgrphand;
 278 } memnode_phys_addr_map_t;
 279
 280 /*
 281  * Number of CPUs for which we got APIC IDs
 282  */
 283 static int                              lgrp_plat_apic_ncpus = 0;
 284
 285 /*
 286  * CPU to node ID mapping table (only used for SRAT) and its max number of
 287  * entries
 288  */
 289 static cpu_node_map_t                   *lgrp_plat_cpu_node = NULL;
 290 static uint_t                           lgrp_plat_cpu_node_nentries = 0;
 291
 292 /*
 293  * Latency statistics
 294  */
 295 lgrp_plat_latency_stats_t               lgrp_plat_lat_stats;
 296
 297 /*
 298  * Whether memory is interleaved across nodes causing MPO to be disabled
 299  */
 300 static int                              lgrp_plat_mem_intrlv = 0;
 301
 302 /*
 303  * Node ID to proximity domain ID mapping table (only used for SRAT)
 304  */
 305 static node_domain_map_t                lgrp_plat_node_domain[MAX_NODES];
 306
 307 /*
 308  * Physical address range for memory in each node
 309  */
 310 static memnode_phys_addr_map_t          lgrp_plat_memnode_info[MAX_MEM_NODES];
 311
 312 /*
 313  * Statistics gotten from probing
 314  */
 315 static lgrp_plat_probe_stats_t          lgrp_plat_probe_stats;
 316
 317 /*
 318  * Memory configuration for probing
 319  */
 320 static lgrp_plat_probe_mem_config_t     lgrp_plat_probe_mem_config;
 321
 322 /*
 323  * Lowest proximity domain ID seen in ACPI SRAT
 324  */
 325 static uint32_t                         lgrp_plat_prox_domain_min = UINT32_MAX;
 326
 327 /*
 328  * Error code from processing ACPI SRAT
 329  */
 330 static int                              lgrp_plat_srat_error = 0;
 331
 332 /*
 333  * Error code from processing ACPI SLIT
 334  */
 335 static int                              lgrp_plat_slit_error = 0;
 336
 337 /*
 338  * Whether lgrp topology has been flattened to 2 levels.
 339  */
 340 static int                              lgrp_plat_topo_flatten = 0;
 341
 342
 343 /*
 344  * Maximum memory node ID in use.
 345  */
 346 static uint_t                           lgrp_plat_max_mem_node;
 347
 348 /*
 349  * Allocate lgroup array statically
 350  */
 351 static lgrp_t                           lgrp_space[NLGRP];
 352 static int                              nlgrps_alloc;
 353
 354
 355 /*
 356  * Enable finding and using minimum proximity domain ID when hashing
 357  */
 358 int                     lgrp_plat_domain_min_enable = 1;
 359
 360 /*
 361  * Maximum possible number of nodes in system
 362  */
 363 uint_t                  lgrp_plat_node_cnt = 1;
 364
 365 /*
 366  * Enable sorting nodes in ascending order by starting physical address
 367  */
 368 int                     lgrp_plat_node_sort_enable = 1;
 369
 370 /*
 371  * Configuration Parameters for Probing
 372  * - lgrp_plat_probe_flags      Flags to specify enabling probing, probe
 373  *                              operation, etc.
 374  * - lgrp_plat_probe_nrounds    How many rounds of probing to do
 375  * - lgrp_plat_probe_nsamples   Number of samples to take when probing each
 376  *                              node
 377  * - lgrp_plat_probe_nreads     Number of times to read vendor ID from
 378  *                              Northbridge for each probe
 379  */
 380 uint_t                  lgrp_plat_probe_flags = 0;
 381 int                     lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;
 382 int                     lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;
 383 int                     lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS;
 384
 385 /*
 386  * Enable use of ACPI System Resource Affinity Table (SRAT), System
 387  * Locality Information Table (SLIT) and Maximum System Capability Table (MSCT)
 388  */
 389 int                     lgrp_plat_srat_enable = 1;
 390 int                     lgrp_plat_slit_enable = 1;
 391 int                     lgrp_plat_msct_enable = 1;
 392
 393 /*
 394  * mnode_xwa: set to non-zero value to initiate workaround if large pages are
 395  * found to be crossing memory node boundaries. The workaround will eliminate
 396  * a base size page at the end of each memory node boundary to ensure that
 397  * a large page with constituent pages that span more than 1 memory node
 398  * can never be formed.
 399  *
 400  */
 401 int     mnode_xwa = 1;
 402
 403 /*
 404  * Static array to hold lgroup statistics
 405  */
 406 struct lgrp_stats       lgrp_stats[NLGRP];
 407
 408
 409 /*
 410  * Forward declarations of platform interface routines
 411  */
 412 void            plat_build_mem_nodes(struct memlist *list);
 413
 414 int             plat_mnode_xcheck(pfn_t pfncnt);
 415
 416 lgrp_handle_t   plat_mem_node_to_lgrphand(int mnode);
 417
 418 int             plat_pfn_to_mem_node(pfn_t pfn);
 419
 420 /*
 421  * Forward declarations of lgroup platform interface routines
 422  */
 423 lgrp_t          *lgrp_plat_alloc(lgrp_id_t lgrpid);
 424
 425 void            lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg);
 426
 427 lgrp_handle_t   lgrp_plat_cpu_to_hand(processorid_t id);
 428
 429 void            lgrp_plat_init(lgrp_init_stages_t stage);
 430
 431 int             lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to);
 432
 433 int             lgrp_plat_max_lgrps(void);
 434
 435 pgcnt_t         lgrp_plat_mem_size(lgrp_handle_t plathand,
 436     lgrp_mem_query_t query);
 437
 438 lgrp_handle_t   lgrp_plat_pfn_to_hand(pfn_t pfn);
 439
 440 void            lgrp_plat_probe(void);
 441
 442 lgrp_handle_t   lgrp_plat_root_hand(void);
 443
 444
 445 /*
 446  * Forward declarations of local routines
 447  */
 448 static int      is_opteron(void);
 449
 450 static int      lgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
 451     int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid,
 452     uint32_t domain);
 453
 454 static int      lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
 455     int cpu_node_nentries);
 456
 457 static int      lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
 458     int node_cnt, uint32_t domain);
 459
 460 static void     lgrp_plat_get_numa_config(void);
 461
 462 static void     lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info,
 463     lgrp_plat_latency_stats_t *lat_stats,
 464     lgrp_plat_probe_stats_t *probe_stats);
 465
 466 static int      lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info,
 467     lgrp_plat_latency_stats_t *lat_stats);
 468
 469 static void     lgrp_plat_main_init(void);
 470
 471 static pgcnt_t  lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t);
 472
 473 static int      lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
 474     int node_cnt, uint32_t domain);
 475
 476 static int      lgrp_plat_memnode_info_update(node_domain_map_t *node_domain,
 477     int node_cnt, memnode_phys_addr_map_t *memnode_info, int memnode_cnt,
 478     uint64_t start, uint64_t end, uint32_t domain, uint32_t device_id);
 479
 480 static void     lgrp_plat_node_sort(node_domain_map_t *node_domain,
 481     int node_cnt, cpu_node_map_t *cpu_node, int cpu_count,
 482     memnode_phys_addr_map_t *memnode_info);
 483
 484 static hrtime_t lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
 485     int cpu_node_nentries, lgrp_plat_probe_mem_config_t *probe_mem_config,
 486     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats);
 487
 488 static int      lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node);
 489
 490 static int      lgrp_plat_process_slit(ACPI_TABLE_SLIT *tp,
 491     node_domain_map_t *node_domain, uint_t node_cnt,
 492     memnode_phys_addr_map_t *memnode_info,
 493     lgrp_plat_latency_stats_t *lat_stats);
 494
 495 static int      lgrp_plat_process_sli(uint32_t domain, uchar_t *sli_info,
 496     uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt,
 497     lgrp_plat_latency_stats_t *lat_stats);
 498
 499 static int      lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp,
 500     uint32_t *prox_domain_min, node_domain_map_t *node_domain,
 501     cpu_node_map_t *cpu_node, int cpu_count,
 502     memnode_phys_addr_map_t *memnode_info);
 503
 504 static void     lgrp_plat_release_bootstrap(void);
 505
 506 static int      lgrp_plat_srat_domains(ACPI_TABLE_SRAT *tp,
 507     uint32_t *prox_domain_min);
 508
 509 static int      lgrp_plat_msct_domains(ACPI_TABLE_MSCT *tp,
 510     uint32_t *prox_domain_min);
 511
 512 static void     lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats);
 513
 514 static void     opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
 515     memnode_phys_addr_map_t *memnode_info);
 516
 517 static hrtime_t opt_probe_vendor(int dest_node, int nreads);
 518
 519
 520 /*
 521  * PLATFORM INTERFACE ROUTINES
 522  */
 523
 524 /*
 525  * Configure memory nodes for machines with more than one node (ie NUMA)
 526  */
 527 void
 528 plat_build_mem_nodes(struct memlist *list)
 529 {
 530         pfn_t           cur_start;      /* start addr of subrange */
 531         pfn_t           cur_end;        /* end addr of subrange */
 532         pfn_t           start;          /* start addr of whole range */
 533         pfn_t           end;            /* end addr of whole range */
 534         pgcnt_t         endcnt;         /* pages to sacrifice */
 535
 536         /*
 537          * Boot install lists are arranged <addr, len>, ...
 538          */
 539         while (list) {
 540                 int     node;
 541
 542                 start = list->ml_address >> PAGESHIFT;
 543                 end = (list->ml_address + list->ml_size - 1) >> PAGESHIFT;
 544
 545                 if (start > physmax) {
 546                         list = list->ml_next;
 547                         continue;
 548                 }
 549                 if (end > physmax)
 550                         end = physmax;
 551
 552                 /*
 553                  * When there is only one memnode, just add memory to memnode
 554                  */
 555                 if (max_mem_nodes == 1) {
 556                         mem_node_add_slice(start, end);
 557                         list = list->ml_next;
 558                         continue;
 559                 }
 560
 561                 /*
 562                  * mem_node_add_slice() expects to get a memory range that
 563                  * is within one memnode, so need to split any memory range
 564                  * that spans multiple memnodes into subranges that are each
 565                  * contained within one memnode when feeding them to
 566                  * mem_node_add_slice()
 567                  */
 568                 cur_start = start;
 569                 do {
 570                         node = plat_pfn_to_mem_node(cur_start);
 571
 572                         /*
 573                          * Panic if DRAM address map registers or SRAT say
 574                          * memory in node doesn't exist or address from
 575                          * boot installed memory list entry isn't in this node.
 576                          * This shouldn't happen and rest of code can't deal
 577                          * with this if it does.
 578                          */
 579                         if (node < 0 || node >= lgrp_plat_max_mem_node ||
 580                             !lgrp_plat_memnode_info[node].exists ||
 581                             cur_start < lgrp_plat_memnode_info[node].start ||
 582                             cur_start > lgrp_plat_memnode_info[node].end) {
 583                                 cmn_err(CE_PANIC, "Don't know which memnode "
 584                                     "to add installed memory address 0x%lx\n",
 585                                     cur_start);
 586                         }
 587
 588                         /*
 589                          * End of current subrange should not span memnodes
 590                          */
 591                         cur_end = end;
 592                         endcnt = 0;
 593                         if (lgrp_plat_memnode_info[node].exists &&
 594                             cur_end > lgrp_plat_memnode_info[node].end) {
 595                                 cur_end = lgrp_plat_memnode_info[node].end;
 596                                 if (mnode_xwa > 1) {
 597                                         /*
 598                                          * sacrifice the last page in each
 599                                          * node to eliminate large pages
 600                                          * that span more than 1 memory node.
 601                                          */
 602                                         endcnt = 1;
 603                                         physinstalled--;
 604                                 }
 605                         }
 606
 607                         mem_node_add_slice(cur_start, cur_end - endcnt);
 608
 609                         /*
 610                          * Next subrange starts after end of current one
 611                          */
 612                         cur_start = cur_end + 1;
 613                 } while (cur_end < end);
 614
 615                 list = list->ml_next;
 616         }
 617         mem_node_physalign = 0;
 618         mem_node_pfn_shift = 0;
 619 }
 620
 621
 622 /*
 623  * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt
 624  * range of pages aligned on pfncnt that crosses an node boundary. Returns 1 if
 625  * a crossing is found and returns 0 otherwise.
 626  */
 627 int
 628 plat_mnode_xcheck(pfn_t pfncnt)
 629 {
 630         int     node, prevnode = -1, basenode;
 631         pfn_t   ea, sa;
 632
 633         for (node = 0; node < lgrp_plat_max_mem_node; node++) {
 634
 635                 if (lgrp_plat_memnode_info[node].exists == 0)
 636                         continue;
 637
 638                 if (prevnode == -1) {
 639                         prevnode = node;
 640                         basenode = node;
 641                         continue;
 642                 }
 643
 644                 /* assume x86 node pfn ranges are in increasing order */
 645                 ASSERT(lgrp_plat_memnode_info[node].start >
 646                     lgrp_plat_memnode_info[prevnode].end);
 647
 648                 /*
 649                  * continue if the starting address of node is not contiguous
 650                  * with the previous node.
 651                  */
 652
 653                 if (lgrp_plat_memnode_info[node].start !=
 654                     (lgrp_plat_memnode_info[prevnode].end + 1)) {
 655                         basenode = node;
 656                         prevnode = node;
 657                         continue;
 658                 }
 659
 660                 /* check if the starting address of node is pfncnt aligned */
 661                 if ((lgrp_plat_memnode_info[node].start & (pfncnt - 1)) != 0) {
 662
 663                         /*
 664                          * at this point, node starts at an unaligned boundary
 665                          * and is contiguous with the previous node(s) to
 666                          * basenode. Check if there is an aligned contiguous
 667                          * range of length pfncnt that crosses this boundary.
 668                          */
 669
 670                         sa = P2ALIGN(lgrp_plat_memnode_info[prevnode].end,
 671                             pfncnt);
 672                         ea = P2ROUNDUP((lgrp_plat_memnode_info[node].start),
 673                             pfncnt);
 674
 675                         ASSERT((ea - sa) == pfncnt);
 676                         if (sa >= lgrp_plat_memnode_info[basenode].start &&
 677                             ea <= (lgrp_plat_memnode_info[node].end + 1)) {
 678                                 /*
 679                                  * large page found to cross mnode boundary.
 680                                  * Return Failure if workaround not enabled.
 681                                  */
 682                                 if (mnode_xwa == 0)
 683                                         return (1);
 684                                 mnode_xwa++;
 685                         }
 686                 }
 687                 prevnode = node;
 688         }
 689         return (0);
 690 }
 691
 692
 693 lgrp_handle_t
 694 plat_mem_node_to_lgrphand(int mnode)
 695 {
 696         if (max_mem_nodes == 1)
 697                 return (LGRP_DEFAULT_HANDLE);
 698
 699         ASSERT(0 <= mnode && mnode < lgrp_plat_max_mem_node);
 700
 701         return ((lgrp_handle_t)(lgrp_plat_memnode_info[mnode].lgrphand));
 702 }
 703
 704 int
 705 plat_pfn_to_mem_node(pfn_t pfn)
 706 {
 707         int     node;
 708
 709         if (max_mem_nodes == 1)
 710                 return (0);
 711
 712         for (node = 0; node < lgrp_plat_max_mem_node; node++) {
 713                 /*
 714                  * Skip nodes with no memory
 715                  */
 716                 if (!lgrp_plat_memnode_info[node].exists)
 717                         continue;
 718
 719                 membar_consumer();
 720                 if (pfn >= lgrp_plat_memnode_info[node].start &&
 721                     pfn <= lgrp_plat_memnode_info[node].end)
 722                         return (node);
 723         }
 724
 725         /*
 726          * Didn't find memnode where this PFN lives which should never happen
 727          */
 728         ASSERT(node < lgrp_plat_max_mem_node);
 729         return (-1);
 730 }
 731
 732
 733 /*
 734  * LGROUP PLATFORM INTERFACE ROUTINES
 735  */
 736
 737 /*
 738  * Allocate additional space for an lgroup.
 739  */
 740 lgrp_t *
 741 lgrp_plat_alloc(lgrp_id_t lgrpid)
 742 {
 743         lgrp_t *lgrp;
 744
 745         lgrp = &lgrp_space[nlgrps_alloc++];
 746         if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP)
 747                 return (NULL);
 748         return (lgrp);
 749 }
 750
 751
 752 /*
 753  * Platform handling for (re)configuration changes
 754  *
 755  * Mechanism to protect lgrp_plat_cpu_node[] at CPU hotplug:
 756  * 1) Use cpu_lock to synchronize between lgrp_plat_config() and
 757  *    lgrp_plat_cpu_to_hand().
 758  * 2) Disable latency probing logic by making sure that the flag
 759  *    LGRP_PLAT_PROBE_ENABLE is cleared.
 760  *
 761  * Mechanism to protect lgrp_plat_memnode_info[] at memory hotplug:
 762  * 1) Only inserts into lgrp_plat_memnode_info at memory hotplug, no removal.
 763  * 2) Only expansion to existing entries, no shrinking.
 764  * 3) On writing side, DR framework ensures that lgrp_plat_config() is called
 765  *    in single-threaded context. And membar_producer() is used to ensure that
 766  *    all changes are visible to other CPUs before setting the "exists" flag.
 767  * 4) On reading side, membar_consumer() after checking the "exists" flag
 768  *    ensures that right values are retrieved.
 769  *
 770  * Mechanism to protect lgrp_plat_node_domain[] at hotplug:
 771  * 1) Only insertion into lgrp_plat_node_domain at hotplug, no removal.
 772  * 2) On writing side, it's single-threaded and membar_producer() is used to
 773  *    ensure all changes are visible to other CPUs before setting the "exists"
 774  *    flag.
 775  * 3) On reading side, membar_consumer() after checking the "exists" flag
 776  *    ensures that right values are retrieved.
 777  */
 778 void
 779 lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
 780 {
 781 #ifdef  __xpv
 782         _NOTE(ARGUNUSED(flag, arg));
 783 #else
 784         int     rc, node;
 785         cpu_t   *cp;
 786         void    *hdl = NULL;
 787         uchar_t *sliptr = NULL;
 788         uint32_t domain, apicid, slicnt = 0;
 789         update_membounds_t *mp;
 790
 791         extern int acpidev_dr_get_cpu_numa_info(cpu_t *, void **, uint32_t *,
 792             uint32_t *, uint32_t *, uchar_t **);
 793         extern void acpidev_dr_free_cpu_numa_info(void *);
 794
 795         /*
 796          * This interface is used to support CPU/memory DR operations.
 797          * Don't bother here if it's still during boot or only one lgrp node
 798          * is supported.
 799          */
 800         if (!lgrp_topo_initialized || lgrp_plat_node_cnt == 1)
 801                 return;
 802
 803         switch (flag) {
 804         case LGRP_CONFIG_CPU_ADD:
 805                 cp = (cpu_t *)arg;
 806                 ASSERT(cp != NULL);
 807                 ASSERT(MUTEX_HELD(&cpu_lock));
 808
 809                 /* Check whether CPU already exists. */
 810                 ASSERT(!lgrp_plat_cpu_node[cp->cpu_id].exists);
 811                 if (lgrp_plat_cpu_node[cp->cpu_id].exists) {
 812                         cmn_err(CE_WARN,
 813                             "!lgrp: CPU(%d) already exists in cpu_node map.",
 814                             cp->cpu_id);
 815                         break;
 816                 }
 817
 818                 /* Query CPU lgrp information. */
 819                 rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain,
 820                     &slicnt, &sliptr);
 821                 ASSERT(rc == 0);
 822                 if (rc != 0) {
 823                         cmn_err(CE_WARN,
 824                             "!lgrp: failed to query lgrp info for CPU(%d).",
 825                             cp->cpu_id);
 826                         break;
 827                 }
 828
 829                 /* Update node to proximity domain mapping */
 830                 node = lgrp_plat_domain_to_node(lgrp_plat_node_domain,
 831                     lgrp_plat_node_cnt, domain);
 832                 if (node == -1) {
 833                         node = lgrp_plat_node_domain_update(
 834                             lgrp_plat_node_domain, lgrp_plat_node_cnt, domain);
 835                         ASSERT(node != -1);
 836                         if (node == -1) {
 837                                 acpidev_dr_free_cpu_numa_info(hdl);
 838                                 cmn_err(CE_WARN, "!lgrp: failed to update "
 839                                     "node_domain map for domain(%u).", domain);
 840                                 break;
 841                         }
 842                 }
 843
 844                 /* Update latency information among lgrps. */
 845                 if (slicnt != 0 && sliptr != NULL) {
 846                         if (lgrp_plat_process_sli(domain, sliptr, slicnt,
 847                             lgrp_plat_node_domain, lgrp_plat_node_cnt,
 848                             &lgrp_plat_lat_stats) != 0) {
 849                                 cmn_err(CE_WARN, "!lgrp: failed to update "
 850                                     "latency information for domain (%u).",
 851                                     domain);
 852                         }
 853                 }
 854
 855                 /* Update CPU to node mapping. */
 856                 lgrp_plat_cpu_node[cp->cpu_id].prox_domain = domain;
 857                 lgrp_plat_cpu_node[cp->cpu_id].node = node;
 858                 lgrp_plat_cpu_node[cp->cpu_id].apicid = apicid;
 859                 lgrp_plat_cpu_node[cp->cpu_id].exists = 1;
 860                 lgrp_plat_apic_ncpus++;
 861
 862                 acpidev_dr_free_cpu_numa_info(hdl);
 863                 break;
 864
 865         case LGRP_CONFIG_CPU_DEL:
 866                 cp = (cpu_t *)arg;
 867                 ASSERT(cp != NULL);
 868                 ASSERT(MUTEX_HELD(&cpu_lock));
 869
 870                 /* Check whether CPU exists. */
 871                 ASSERT(lgrp_plat_cpu_node[cp->cpu_id].exists);
 872                 if (!lgrp_plat_cpu_node[cp->cpu_id].exists) {
 873                         cmn_err(CE_WARN,
 874                             "!lgrp: CPU(%d) doesn't exist in cpu_node map.",
 875                             cp->cpu_id);
 876                         break;
 877                 }
 878
 879                 /* Query CPU lgrp information. */
 880                 rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain,
 881                     NULL, NULL);
 882                 ASSERT(rc == 0);
 883                 if (rc != 0) {
 884                         cmn_err(CE_WARN,
 885                             "!lgrp: failed to query lgrp info for CPU(%d).",
 886                             cp->cpu_id);
 887                         break;
 888                 }
 889
 890                 /* Update map. */
 891                 ASSERT(lgrp_plat_cpu_node[cp->cpu_id].apicid == apicid);
 892                 ASSERT(lgrp_plat_cpu_node[cp->cpu_id].prox_domain == domain);
 893                 lgrp_plat_cpu_node[cp->cpu_id].exists = 0;
 894                 lgrp_plat_cpu_node[cp->cpu_id].apicid = UINT32_MAX;
 895                 lgrp_plat_cpu_node[cp->cpu_id].prox_domain = UINT32_MAX;
 896                 lgrp_plat_cpu_node[cp->cpu_id].node = UINT_MAX;
 897                 lgrp_plat_apic_ncpus--;
 898
 899                 acpidev_dr_free_cpu_numa_info(hdl);
 900                 break;
 901
 902         case LGRP_CONFIG_MEM_ADD:
 903                 mp = (update_membounds_t *)arg;
 904                 ASSERT(mp != NULL);
 905
 906                 /* Update latency information among lgrps. */
 907                 if (mp->u_sli_cnt != 0 && mp->u_sli_ptr != NULL) {
 908                         if (lgrp_plat_process_sli(mp->u_domain,
 909                             mp->u_sli_ptr, mp->u_sli_cnt,
 910                             lgrp_plat_node_domain, lgrp_plat_node_cnt,
 911                             &lgrp_plat_lat_stats) != 0) {
 912                                 cmn_err(CE_WARN, "!lgrp: failed to update "
 913                                     "latency information for domain (%u).",
 914                                     domain);
 915                         }
 916                 }
 917
 918                 if (lgrp_plat_memnode_info_update(lgrp_plat_node_domain,
 919                     lgrp_plat_node_cnt, lgrp_plat_memnode_info, max_mem_nodes,
 920                     mp->u_base, mp->u_base + mp->u_length,
 921                     mp->u_domain, mp->u_device_id) < 0) {
 922                         cmn_err(CE_WARN,
 923                             "!lgrp: failed to update latency  information for "
 924                             "memory (0x%" PRIx64 " - 0x%" PRIx64 ").",
 925                             mp->u_base, mp->u_base + mp->u_length);
 926                 }
 927                 break;
 928
 929         default:
 930                 break;
 931         }
 932 #endif  /* __xpv */
 933 }
 934
 935
 936 /*
 937  * Return the platform handle for the lgroup containing the given CPU
 938  */
 939 lgrp_handle_t
 940 lgrp_plat_cpu_to_hand(processorid_t id)
 941 {
 942         lgrp_handle_t   hand;
 943
 944         ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
 945
 946         if (lgrp_plat_node_cnt == 1)
 947                 return (LGRP_DEFAULT_HANDLE);
 948
 949         hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id],
 950             lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries);
 951
 952         ASSERT(hand != (lgrp_handle_t)-1);
 953         if (hand == (lgrp_handle_t)-1)
 954                 return (LGRP_NULL_HANDLE);
 955
 956         return (hand);
 957 }
 958
 959
 960 /*
 961  * Platform-specific initialization of lgroups
 962  */
 963 void
 964 lgrp_plat_init(lgrp_init_stages_t stage)
 965 {
 966 #if defined(__xpv)
 967 #else   /* __xpv */
 968         u_longlong_t    value;
 969 #endif  /* __xpv */
 970
 971         switch (stage) {
 972         case LGRP_INIT_STAGE1:
 973 #if defined(__xpv)
 974                 /*
 975                  * XXPV For now, the hypervisor treats all memory equally.
 976                  */
 977                 lgrp_plat_node_cnt = max_mem_nodes = 1;
 978 #else   /* __xpv */
 979
 980                 /*
 981                  * Get boot property for lgroup topology height limit
 982                  */
 983                 if (bootprop_getval(BP_LGRP_TOPO_LEVELS, &value) == 0)
 984                         (void) lgrp_topo_ht_limit_set((int)value);
 985
 986                 /*
 987                  * Get boot property for enabling/disabling SRAT
 988                  */
 989                 if (bootprop_getval(BP_LGRP_SRAT_ENABLE, &value) == 0)
 990                         lgrp_plat_srat_enable = (int)value;
 991
 992                 /*
 993                  * Get boot property for enabling/disabling SLIT
 994                  */
 995                 if (bootprop_getval(BP_LGRP_SLIT_ENABLE, &value) == 0)
 996                         lgrp_plat_slit_enable = (int)value;
 997
 998                 /*
 999                  * Get boot property for enabling/disabling MSCT
1000                  */
1001                 if (bootprop_getval(BP_LGRP_MSCT_ENABLE, &value) == 0)
1002                         lgrp_plat_msct_enable = (int)value;
1003
1004                 /*
1005                  * Initialize as a UMA machine
1006                  */
1007                 if (lgrp_topo_ht_limit() == 1) {
1008                         lgrp_plat_node_cnt = max_mem_nodes = 1;
1009                         lgrp_plat_max_mem_node = 1;
1010                         return;
1011                 }
1012
1013                 lgrp_plat_get_numa_config();
1014
1015                 /*
1016                  * Each lgrp node needs MAX_MEM_NODES_PER_LGROUP memnodes
1017                  * to support memory DR operations if memory DR is enabled.
1018                  */
1019                 lgrp_plat_max_mem_node = lgrp_plat_node_cnt;
1020                 if (plat_dr_support_memory() && lgrp_plat_node_cnt != 1) {
1021                         max_mem_nodes = MAX_MEM_NODES_PER_LGROUP *
1022                             lgrp_plat_node_cnt;
1023                         ASSERT(max_mem_nodes <= MAX_MEM_NODES);
1024                 }
1025 #endif  /* __xpv */
1026                 break;
1027
1028         case LGRP_INIT_STAGE3:
1029                 lgrp_plat_probe();
1030                 lgrp_plat_release_bootstrap();
1031                 break;
1032
1033         case LGRP_INIT_STAGE4:
1034                 lgrp_plat_main_init();
1035                 break;
1036
1037         default:
1038                 break;
1039         }
1040 }
1041
1042
1043 /*
1044  * Return latency between "from" and "to" lgroups
1045  *
1046  * This latency number can only be used for relative comparison
1047  * between lgroups on the running system, cannot be used across platforms,
1048  * and may not reflect the actual latency.  It is platform and implementation
1049  * specific, so platform gets to decide its value.  It would be nice if the
1050  * number was at least proportional to make comparisons more meaningful though.
1051  */
1052 int
1053 lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
1054 {
1055         lgrp_handle_t   src, dest;
1056         int             node;
1057
1058         if (max_mem_nodes == 1)
1059                 return (0);
1060
1061         /*
1062          * Return max latency for root lgroup
1063          */
1064         if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
1065                 return (lgrp_plat_lat_stats.latency_max);
1066
1067         src = from;
1068         dest = to;
1069
1070         /*
1071          * Return 0 for nodes (lgroup platform handles) out of range
1072          */
1073         if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
1074                 return (0);
1075
1076         /*
1077          * Probe from current CPU if its lgroup latencies haven't been set yet
1078          * and we are trying to get latency from current CPU to some node.
1079          * Avoid probing if CPU/memory DR is enabled.
1080          */
1081         if (lgrp_plat_lat_stats.latencies[src][src] == 0) {
1082                 /*
1083                  * Latency information should be updated by lgrp_plat_config()
1084                  * for DR operations. Something is wrong if reaches here.
1085                  * For safety, flatten lgrp topology to two levels.
1086                  */
1087                 if (plat_dr_support_cpu() || plat_dr_support_memory()) {
1088                         ASSERT(lgrp_plat_lat_stats.latencies[src][src]);
1089                         cmn_err(CE_WARN,
1090                             "lgrp: failed to get latency information, "
1091                             "fall back to two-level topology.");
1092                         lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
1093                 } else {
1094                         node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
1095                             lgrp_plat_cpu_node_nentries);
1096                         ASSERT(node >= 0 && node < lgrp_plat_node_cnt);
1097                         if (node == src)
1098                                 lgrp_plat_probe();
1099                 }
1100         }
1101
1102         return (lgrp_plat_lat_stats.latencies[src][dest]);
1103 }
1104
1105
1106 /*
1107  * Return the maximum number of lgrps supported by the platform.
1108  * Before lgrp topology is known it returns an estimate based on the number of
1109  * nodes. Once topology is known it returns:
1110  * 1) the actual maximim number of lgrps created if CPU/memory DR operations
1111  *    are not suppported.
1112  * 2) the maximum possible number of lgrps if CPU/memory DR operations are
1113  *    supported.
1114  */
1115 int
1116 lgrp_plat_max_lgrps(void)
1117 {
1118         if (!lgrp_topo_initialized || plat_dr_support_cpu() ||
1119             plat_dr_support_memory()) {
1120                 return (lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
1121         } else {
1122                 return (lgrp_alloc_max + 1);
1123         }
1124 }
1125
1126
1127 /*
1128  * Count number of memory pages (_t) based on mnode id (_n) and query type (_t).
1129  */
1130 #define _LGRP_PLAT_MEM_SIZE(_n, _q, _t)                                 \
1131         if (mem_node_config[_n].exists) {                               \
1132                 switch (_q) {                                           \
1133                 case LGRP_MEM_SIZE_FREE:                                \
1134                         _t += MNODE_PGCNT(_n);                          \
1135                         break;                                          \
1136                 case LGRP_MEM_SIZE_AVAIL:                               \
1137                         _t += mem_node_memlist_pages(_n, phys_avail);   \
1138                                 break;                                  \
1139                 case LGRP_MEM_SIZE_INSTALL:                             \
1140                         _t += mem_node_memlist_pages(_n, phys_install); \
1141                         break;                                          \
1142                 default:                                                \
1143                         break;                                          \
1144                 }                                                       \
1145         }
1146
1147 /*
1148  * Return the number of free pages in an lgroup.
1149  *
1150  * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
1151  * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
1152  * number of allocatable base pagesize pages corresponding to the
1153  * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
1154  * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
1155  * memory installed, regardless of whether or not it's usable.
1156  */
1157 pgcnt_t
1158 lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
1159 {
1160         int     mnode;
1161         pgcnt_t npgs = (pgcnt_t)0;
1162         extern struct memlist *phys_avail;
1163         extern struct memlist *phys_install;
1164
1165
1166         if (plathand == LGRP_DEFAULT_HANDLE)
1167                 return (lgrp_plat_mem_size_default(plathand, query));
1168
1169         if (plathand != LGRP_NULL_HANDLE) {
1170                 /* Count memory node present at boot. */
1171                 mnode = (int)plathand;
1172                 ASSERT(mnode < lgrp_plat_node_cnt);
1173                 _LGRP_PLAT_MEM_SIZE(mnode, query, npgs);
1174
1175                 /* Count possible hot-added memory nodes. */
1176                 for (mnode = lgrp_plat_node_cnt;
1177                     mnode < lgrp_plat_max_mem_node; mnode++) {
1178                         if (lgrp_plat_memnode_info[mnode].lgrphand == plathand)
1179                                 _LGRP_PLAT_MEM_SIZE(mnode, query, npgs);
1180                 }
1181         }
1182
1183         return (npgs);
1184 }
1185
1186
1187 /*
1188  * Return the platform handle of the lgroup that contains the physical memory
1189  * corresponding to the given page frame number
1190  */
1191 lgrp_handle_t
1192 lgrp_plat_pfn_to_hand(pfn_t pfn)
1193 {
1194         int     mnode;
1195
1196         if (max_mem_nodes == 1)
1197                 return (LGRP_DEFAULT_HANDLE);
1198
1199         if (pfn > physmax)
1200                 return (LGRP_NULL_HANDLE);
1201
1202         mnode = plat_pfn_to_mem_node(pfn);
1203         if (mnode < 0)
1204                 return (LGRP_NULL_HANDLE);
1205
1206         return (MEM_NODE_2_LGRPHAND(mnode));
1207 }
1208
1209
1210 /*
1211  * Probe memory in each node from current CPU to determine latency topology
1212  *
1213  * The probing code will probe the vendor ID register on the Northbridge of
1214  * Opteron processors and probe memory for other processors by default.
1215  *
1216  * Since probing is inherently error prone, the code takes laps across all the
1217  * nodes probing from each node to each of the other nodes some number of
1218  * times.  Furthermore, each node is probed some number of times before moving
1219  * onto the next one during each lap.  The minimum latency gotten between nodes
1220  * is kept as the latency between the nodes.
1221  *
1222  * After all that,  the probe times are adjusted by normalizing values that are
1223  * close to each other and local latencies are made the same.  Lastly, the
1224  * latencies are verified to make sure that certain conditions are met (eg.
1225  * local < remote, latency(a, b) == latency(b, a), etc.).
1226  *
1227  * If any of the conditions aren't met, the code will export a NUMA
1228  * configuration with the local CPUs and memory given by the SRAT or PCI config
1229  * space registers and one remote memory latency since it can't tell exactly
1230  * how far each node is from each other.
1231  */
1232 void
1233 lgrp_plat_probe(void)
1234 {
1235         int                             from;
1236         int                             i;
1237         lgrp_plat_latency_stats_t       *lat_stats;
1238         boolean_t                       probed;
1239         hrtime_t                        probe_time;
1240         int                             to;
1241
1242         if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
1243             max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
1244                 return;
1245
1246         /* SRAT and SLIT should be enabled if DR operations are enabled. */
1247         if (plat_dr_support_cpu() || plat_dr_support_memory())
1248                 return;
1249
1250         /*
1251          * Determine ID of node containing current CPU
1252          */
1253         from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
1254             lgrp_plat_cpu_node_nentries);
1255         ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
1256         if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error)
1257                 ASSERT(lgrp_plat_node_domain[from].exists);
1258
1259         /*
1260          * Don't need to probe if got times already
1261          */
1262         lat_stats = &lgrp_plat_lat_stats;
1263         if (lat_stats->latencies[from][from] != 0)
1264                 return;
1265
1266         /*
1267          * Read vendor ID in Northbridge or read and write page(s)
1268          * in each node from current CPU and remember how long it takes,
1269          * so we can build latency topology of machine later.
1270          * This should approximate the memory latency between each node.
1271          */
1272         probed = B_FALSE;
1273         for (i = 0; i < lgrp_plat_probe_nrounds; i++) {
1274                 for (to = 0; to < lgrp_plat_node_cnt; to++) {
1275                         /*
1276                          * Get probe time and skip over any nodes that can't be
1277                          * probed yet or don't have memory
1278                          */
1279                         probe_time = lgrp_plat_probe_time(to,
1280                             lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries,
1281                             &lgrp_plat_probe_mem_config, &lgrp_plat_lat_stats,
1282                             &lgrp_plat_probe_stats);
1283                         if (probe_time == 0)
1284                                 continue;
1285
1286                         probed = B_TRUE;
1287
1288                         /*
1289                          * Keep lowest probe time as latency between nodes
1290                          */
1291                         if (lat_stats->latencies[from][to] == 0 ||
1292                             probe_time < lat_stats->latencies[from][to])
1293                                 lat_stats->latencies[from][to] = probe_time;
1294
1295                         /*
1296                          * Update overall minimum and maximum probe times
1297                          * across all nodes
1298                          */
1299                         if (probe_time < lat_stats->latency_min ||
1300                             lat_stats->latency_min == -1)
1301                                 lat_stats->latency_min = probe_time;
1302                         if (probe_time > lat_stats->latency_max)
1303                                 lat_stats->latency_max = probe_time;
1304                 }
1305         }
1306
1307         /*
1308          * Bail out if weren't able to probe any nodes from current CPU
1309          */
1310         if (probed == B_FALSE)
1311                 return;
1312
1313         /*
1314          * - Fix up latencies such that local latencies are same,
1315          *   latency(i, j) == latency(j, i), etc. (if possible)
1316          *
1317          * - Verify that latencies look ok
1318          *
1319          * - Fallback to just optimizing for local and remote if
1320          *   latencies didn't look right
1321          */
1322         lgrp_plat_latency_adjust(lgrp_plat_memnode_info, &lgrp_plat_lat_stats,
1323             &lgrp_plat_probe_stats);
1324         lgrp_plat_probe_stats.probe_error_code =
1325             lgrp_plat_latency_verify(lgrp_plat_memnode_info,
1326             &lgrp_plat_lat_stats);
1327         if (lgrp_plat_probe_stats.probe_error_code)
1328                 lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
1329 }
1330
1331
1332 /*
1333  * Return platform handle for root lgroup
1334  */
1335 lgrp_handle_t
1336 lgrp_plat_root_hand(void)
1337 {
1338         return (LGRP_DEFAULT_HANDLE);
1339 }
1340
1341
1342 /*
1343  * INTERNAL ROUTINES
1344  */
1345
1346
1347 /*
1348  * Update CPU to node mapping for given CPU and proximity domain.
1349  * Return values:
1350  *      - zero for success
1351  *      - positive numbers for warnings
1352  *      - negative numbers for errors
1353  */
1354 static int
1355 lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt,
1356     cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain)
1357 {
1358         uint_t  i;
1359         int     node;
1360
1361         /*
1362          * Get node number for proximity domain
1363          */
1364         node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
1365         if (node == -1) {
1366                 node = lgrp_plat_node_domain_update(node_domain, node_cnt,
1367                     domain);
1368                 if (node == -1)
1369                         return (-1);
1370         }
1371
1372         /*
1373          * Search for entry with given APIC ID and fill in its node and
1374          * proximity domain IDs (if they haven't been set already)
1375          */
1376         for (i = 0; i < nentries; i++) {
1377                 /*
1378                  * Skip nonexistent entries and ones without matching APIC ID
1379                  */
1380                 if (!cpu_node[i].exists || cpu_node[i].apicid != apicid)
1381                         continue;
1382
1383                 /*
1384                  * Just return if entry completely and correctly filled in
1385                  * already
1386                  */
1387                 if (cpu_node[i].prox_domain == domain &&
1388                     cpu_node[i].node == node)
1389                         return (1);
1390
1391                 /*
1392                  * It's invalid to have more than one entry with the same
1393                  * local APIC ID in SRAT table.
1394                  */
1395                 if (cpu_node[i].node != UINT_MAX)
1396                         return (-2);
1397
1398                 /*
1399                  * Fill in node and proximity domain IDs
1400                  */
1401                 cpu_node[i].prox_domain = domain;
1402                 cpu_node[i].node = node;
1403
1404                 return (0);
1405         }
1406
1407         /*
1408          * It's possible that an apicid doesn't exist in the cpu_node map due
1409          * to user limits number of CPUs powered on at boot by specifying the
1410          * boot_ncpus kernel option.
1411          */
1412         return (2);
1413 }
1414
1415
1416 /*
1417  * Get node ID for given CPU
1418  */
1419 static int
1420 lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
1421     int cpu_node_nentries)
1422 {
1423         processorid_t   cpuid;
1424
1425         if (cp == NULL)
1426                 return (-1);
1427
1428         cpuid = cp->cpu_id;
1429         if (cpuid < 0 || cpuid >= max_ncpus)
1430                 return (-1);
1431
1432         /*
1433          * SRAT doesn't exist, isn't enabled, or there was an error processing
1434          * it, so return node ID for Opteron and -1 otherwise.
1435          */
1436         if (srat_ptr == NULL || !lgrp_plat_srat_enable ||
1437             lgrp_plat_srat_error) {
1438                 if (is_opteron())
1439                         return (pg_plat_hw_instance_id(cp, PGHW_PROCNODE));
1440                 return (-1);
1441         }
1442
1443         /*
1444          * Return -1 when CPU to node ID mapping entry doesn't exist for given
1445          * CPU
1446          */
1447         if (cpuid >= cpu_node_nentries || !cpu_node[cpuid].exists)
1448                 return (-1);
1449
1450         return (cpu_node[cpuid].node);
1451 }
1452
1453
1454 /*
1455  * Return node number for given proximity domain/system locality
1456  */
1457 static int
1458 lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt,
1459     uint32_t domain)
1460 {
1461         uint_t  node;
1462         uint_t  start;
1463
1464         /*
1465          * Hash proximity domain ID into node to domain mapping table (array),
1466          * search for entry with matching proximity domain ID, and return index
1467          * of matching entry as node ID.
1468          */
1469         node = start = NODE_DOMAIN_HASH(domain, node_cnt);
1470         do {
1471                 if (node_domain[node].exists) {
1472                         membar_consumer();
1473                         if (node_domain[node].prox_domain == domain)
1474                                 return (node);
1475                 }
1476                 node = (node + 1) % node_cnt;
1477         } while (node != start);
1478         return (-1);
1479 }
1480
1481
1482 /*
1483  * Get NUMA configuration of machine
1484  */
1485 static void
1486 lgrp_plat_get_numa_config(void)
1487 {
1488         uint_t          probe_op;
1489
1490         /*
1491          * Read boot property with CPU to APIC ID mapping table/array to
1492          * determine number of CPUs
1493          */
1494         lgrp_plat_apic_ncpus = lgrp_plat_process_cpu_apicids(NULL);
1495
1496         /*
1497          * Determine which CPUs and memory are local to each other and number
1498          * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT)
1499          */
1500         if (lgrp_plat_apic_ncpus > 0) {
1501                 int     retval;
1502
1503                 /* Reserve enough resources if CPU DR is enabled. */
1504                 if (plat_dr_support_cpu() && max_ncpus > lgrp_plat_apic_ncpus)
1505                         lgrp_plat_cpu_node_nentries = max_ncpus;
1506                 else
1507                         lgrp_plat_cpu_node_nentries = lgrp_plat_apic_ncpus;
1508
1509                 /*
1510                  * Temporarily allocate boot memory to use for CPU to node
1511                  * mapping since kernel memory allocator isn't alive yet
1512                  */
1513                 lgrp_plat_cpu_node = (cpu_node_map_t *)BOP_ALLOC(bootops,
1514                     NULL, lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t),
1515                     sizeof (int));
1516
1517                 ASSERT(lgrp_plat_cpu_node != NULL);
1518                 if (lgrp_plat_cpu_node) {
1519                         bzero(lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries *
1520                             sizeof (cpu_node_map_t));
1521                 } else {
1522                         lgrp_plat_cpu_node_nentries = 0;
1523                 }
1524
1525                 /*
1526                  * Fill in CPU to node ID mapping table with APIC ID for each
1527                  * CPU
1528                  */
1529                 (void) lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node);
1530
1531                 retval = lgrp_plat_process_srat(srat_ptr, msct_ptr,
1532                     &lgrp_plat_prox_domain_min,
1533                     lgrp_plat_node_domain, lgrp_plat_cpu_node,
1534                     lgrp_plat_apic_ncpus, lgrp_plat_memnode_info);
1535                 if (retval <= 0) {
1536                         lgrp_plat_srat_error = retval;
1537                         lgrp_plat_node_cnt = 1;
1538                 } else {
1539                         lgrp_plat_srat_error = 0;
1540                         lgrp_plat_node_cnt = retval;
1541                 }
1542         }
1543
1544         /*
1545          * Try to use PCI config space registers on Opteron if there's an error
1546          * processing CPU to APIC ID mapping or SRAT
1547          */
1548         if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) &&
1549             is_opteron())
1550                 opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv,
1551                     lgrp_plat_memnode_info);
1552
1553         /*
1554          * Don't bother to setup system for multiple lgroups and only use one
1555          * memory node when memory is interleaved between any nodes or there is
1556          * only one NUMA node
1557          */
1558         if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) {
1559                 lgrp_plat_node_cnt = max_mem_nodes = 1;
1560                 (void) lgrp_topo_ht_limit_set(1);
1561                 return;
1562         }
1563
1564         /*
1565          * Leaf lgroups on x86/x64 architectures contain one physical
1566          * processor chip. Tune lgrp_expand_proc_thresh and
1567          * lgrp_expand_proc_diff so that lgrp_choose() will spread
1568          * things out aggressively.
1569          */
1570         lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
1571         lgrp_expand_proc_diff = 0;
1572
1573         /*
1574          * There should be one memnode (physical page free list(s)) for
1575          * each node if memory DR is disabled.
1576          */
1577         max_mem_nodes = lgrp_plat_node_cnt;
1578
1579         /*
1580          * Initialize min and max latency before reading SLIT or probing
1581          */
1582         lgrp_plat_lat_stats.latency_min = -1;
1583         lgrp_plat_lat_stats.latency_max = 0;
1584
1585         /*
1586          * Determine how far each NUMA node is from each other by
1587          * reading ACPI System Locality Information Table (SLIT) if it
1588          * exists
1589          */
1590         lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr,
1591             lgrp_plat_node_domain, lgrp_plat_node_cnt, lgrp_plat_memnode_info,
1592             &lgrp_plat_lat_stats);
1593
1594         /*
1595          * Disable support of CPU/memory DR operations if multiple locality
1596          * domains exist in system and either of following is true.
1597          * 1) Failed to process SLIT table.
1598          * 2) Latency probing is enabled by user.
1599          */
1600         if (lgrp_plat_node_cnt > 1 &&
1601             (plat_dr_support_cpu() || plat_dr_support_memory())) {
1602                 if (!lgrp_plat_slit_enable || lgrp_plat_slit_error != 0 ||
1603                     !lgrp_plat_srat_enable || lgrp_plat_srat_error != 0 ||
1604                     lgrp_plat_apic_ncpus <= 0) {
1605                         cmn_err(CE_CONT,
1606                             "?lgrp: failed to process ACPI SRAT/SLIT table, "
1607                             "disable support of CPU/memory DR operations.");
1608                         plat_dr_disable_cpu();
1609                         plat_dr_disable_memory();
1610                 } else if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) {
1611                         cmn_err(CE_CONT,
1612                             "?lgrp: latency probing enabled by user, "
1613                             "disable support of CPU/memory DR operations.");
1614                         plat_dr_disable_cpu();
1615                         plat_dr_disable_memory();
1616                 }
1617         }
1618
1619         /* Done if succeeded to process SLIT table. */
1620         if (lgrp_plat_slit_error == 0)
1621                 return;
1622
1623         /*
1624          * Probe to determine latency between NUMA nodes when SLIT
1625          * doesn't exist or make sense
1626          */
1627         lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE;
1628
1629         /*
1630          * Specify whether to probe using vendor ID register or page copy
1631          * if hasn't been specified already or is overspecified
1632          */
1633         probe_op = lgrp_plat_probe_flags &
1634             (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
1635
1636         if (probe_op == 0 ||
1637             probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) {
1638                 lgrp_plat_probe_flags &=
1639                     ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
1640                 if (is_opteron())
1641                         lgrp_plat_probe_flags |=
1642                             LGRP_PLAT_PROBE_VENDOR;
1643                 else
1644                         lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY;
1645         }
1646
1647         /*
1648          * Probing errors can mess up the lgroup topology and
1649          * force us fall back to a 2 level lgroup topology.
1650          * Here we bound how tall the lgroup topology can grow
1651          * in hopes of avoiding any anamolies in probing from
1652          * messing up the lgroup topology by limiting the
1653          * accuracy of the latency topology.
1654          *
1655          * Assume that nodes will at least be configured in a
1656          * ring, so limit height of lgroup topology to be less
1657          * than number of nodes on a system with 4 or more
1658          * nodes
1659          */
1660         if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() ==
1661             lgrp_topo_ht_limit_default())
1662                 (void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
1663 }
1664
1665
1666 /*
1667  * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
1668  * be considered same
1669  */
1670 #define LGRP_LAT_TOLERANCE_SHIFT        4
1671
1672 int     lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;
1673
1674
1675 /*
1676  * Adjust latencies between nodes to be symmetric, normalize latencies between
1677  * any nodes that are within some tolerance to be same, and make local
1678  * latencies be same
1679  */
1680 static void
1681 lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info,
1682     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
1683 {
1684         int                             i;
1685         int                             j;
1686         int                             k;
1687         int                             l;
1688         u_longlong_t                    max;
1689         u_longlong_t                    min;
1690         u_longlong_t                    t;
1691         u_longlong_t                    t1;
1692         u_longlong_t                    t2;
1693         const lgrp_config_flag_t        cflag = LGRP_CONFIG_LAT_CHANGE_ALL;
1694         int                             lat_corrected[MAX_NODES][MAX_NODES];
1695
1696         /*
1697          * Nothing to do when this is an UMA machine or don't have args needed
1698          */
1699         if (max_mem_nodes == 1)
1700                 return;
1701
1702         ASSERT(memnode_info != NULL && lat_stats != NULL &&
1703             probe_stats != NULL);
1704
1705         /*
1706          * Make sure that latencies are symmetric between any two nodes
1707          * (ie. latency(node0, node1) == latency(node1, node0))
1708          */
1709         for (i = 0; i < lgrp_plat_node_cnt; i++) {
1710                 if (!memnode_info[i].exists)
1711                         continue;
1712
1713                 for (j = 0; j < lgrp_plat_node_cnt; j++) {
1714                         if (!memnode_info[j].exists)
1715                                 continue;
1716
1717                         t1 = lat_stats->latencies[i][j];
1718                         t2 = lat_stats->latencies[j][i];
1719
1720                         if (t1 == 0 || t2 == 0 || t1 == t2)
1721                                 continue;
1722
1723                         /*
1724                          * Latencies should be same
1725                          * - Use minimum of two latencies which should be same
1726                          * - Track suspect probe times not within tolerance of
1727                          *   min value
1728                          * - Remember how much values are corrected by
1729                          */
1730                         if (t1 > t2) {
1731                                 t = t2;
1732                                 probe_stats->probe_errors[i][j] += t1 - t2;
1733                                 if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
1734                                         probe_stats->probe_suspect[i][j]++;
1735                                         probe_stats->probe_suspect[j][i]++;
1736                                 }
1737                         } else if (t2 > t1) {
1738                                 t = t1;
1739                                 probe_stats->probe_errors[j][i] += t2 - t1;
1740                                 if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
1741                                         probe_stats->probe_suspect[i][j]++;
1742                                         probe_stats->probe_suspect[j][i]++;
1743                                 }
1744                         }
1745
1746                         lat_stats->latencies[i][j] =
1747                             lat_stats->latencies[j][i] = t;
1748                         lgrp_config(cflag, t1, t);
1749                         lgrp_config(cflag, t2, t);
1750                 }
1751         }
1752
1753         /*
1754          * Keep track of which latencies get corrected
1755          */
1756         for (i = 0; i < MAX_NODES; i++)
1757                 for (j = 0; j < MAX_NODES; j++)
1758                         lat_corrected[i][j] = 0;
1759
1760         /*
1761          * For every two nodes, see whether there is another pair of nodes which
1762          * are about the same distance apart and make the latencies be the same
1763          * if they are close enough together
1764          */
1765         for (i = 0; i < lgrp_plat_node_cnt; i++) {
1766                 for (j = 0; j < lgrp_plat_node_cnt; j++) {
1767                         if (!memnode_info[j].exists)
1768                                 continue;
1769                         /*
1770                          * Pick one pair of nodes (i, j)
1771                          * and get latency between them
1772                          */
1773                         t1 = lat_stats->latencies[i][j];
1774
1775                         /*
1776                          * Skip this pair of nodes if there isn't a latency
1777                          * for it yet
1778                          */
1779                         if (t1 == 0)
1780                                 continue;
1781
1782                         for (k = 0; k < lgrp_plat_node_cnt; k++) {
1783                                 for (l = 0; l < lgrp_plat_node_cnt; l++) {
1784                                         if (!memnode_info[l].exists)
1785                                                 continue;
1786                                         /*
1787                                          * Pick another pair of nodes (k, l)
1788                                          * not same as (i, j) and get latency
1789                                          * between them
1790                                          */
1791                                         if (k == i && l == j)
1792                                                 continue;
1793
1794                                         t2 = lat_stats->latencies[k][l];
1795
1796                                         /*
1797                                          * Skip this pair of nodes if there
1798                                          * isn't a latency for it yet
1799                                          */
1800
1801                                         if (t2 == 0)
1802                                                 continue;
1803
1804                                         /*
1805                                          * Skip nodes (k, l) if they already
1806                                          * have same latency as (i, j) or
1807                                          * their latency isn't close enough to
1808                                          * be considered/made the same
1809                                          */
1810                                         if (t1 == t2 || (t1 > t2 && t1 - t2 >
1811                                             t1 >> lgrp_plat_probe_lt_shift) ||
1812                                             (t2 > t1 && t2 - t1 >
1813                                             t2 >> lgrp_plat_probe_lt_shift))
1814                                                 continue;
1815
1816                                         /*
1817                                          * Make latency(i, j) same as
1818                                          * latency(k, l), try to use latency
1819                                          * that has been adjusted already to get
1820                                          * more consistency (if possible), and
1821                                          * remember which latencies were
1822                                          * adjusted for next time
1823                                          */
1824                                         if (lat_corrected[i][j]) {
1825                                                 t = t1;
1826                                                 lgrp_config(cflag, t2, t);
1827                                                 t2 = t;
1828                                         } else if (lat_corrected[k][l]) {
1829                                                 t = t2;
1830                                                 lgrp_config(cflag, t1, t);
1831                                                 t1 = t;
1832                                         } else {
1833                                                 if (t1 > t2)
1834                                                         t = t2;
1835                                                 else
1836                                                         t = t1;
1837                                                 lgrp_config(cflag, t1, t);
1838                                                 lgrp_config(cflag, t2, t);
1839                                                 t1 = t2 = t;
1840                                         }
1841
1842                                         lat_stats->latencies[i][j] =
1843                                             lat_stats->latencies[k][l] = t;
1844
1845                                         lat_corrected[i][j] =
1846                                             lat_corrected[k][l] = 1;
1847                                 }
1848                         }
1849                 }
1850         }
1851
1852         /*
1853          * Local latencies should be same
1854          * - Find min and max local latencies
1855          * - Make all local latencies be minimum
1856          */
1857         min = -1;
1858         max = 0;
1859         for (i = 0; i < lgrp_plat_node_cnt; i++) {
1860                 if (!memnode_info[i].exists)
1861                         continue;
1862                 t = lat_stats->latencies[i][i];
1863                 if (t == 0)
1864                         continue;
1865                 if (min == -1 || t < min)
1866                         min = t;
1867                 if (t > max)
1868                         max = t;
1869         }
1870         if (min != max) {
1871                 for (i = 0; i < lgrp_plat_node_cnt; i++) {
1872                         int     local;
1873
1874                         if (!memnode_info[i].exists)
1875                                 continue;
1876
1877                         local = lat_stats->latencies[i][i];
1878                         if (local == 0)
1879                                 continue;
1880
1881                         /*
1882                          * Track suspect probe times that aren't within
1883                          * tolerance of minimum local latency and how much
1884                          * probe times are corrected by
1885                          */
1886                         if (local - min > min >> lgrp_plat_probe_lt_shift)
1887                                 probe_stats->probe_suspect[i][i]++;
1888
1889                         probe_stats->probe_errors[i][i] += local - min;
1890
1891                         /*
1892                          * Make local latencies be minimum
1893                          */
1894                         lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min);
1895                         lat_stats->latencies[i][i] = min;
1896                 }
1897         }
1898
1899         /*
1900          * Determine max probe time again since just adjusted latencies
1901          */
1902         lat_stats->latency_max = 0;
1903         for (i = 0; i < lgrp_plat_node_cnt; i++) {
1904                 for (j = 0; j < lgrp_plat_node_cnt; j++) {
1905                         if (!memnode_info[j].exists)
1906                                 continue;
1907                         t = lat_stats->latencies[i][j];
1908                         if (t > lat_stats->latency_max)
1909                                 lat_stats->latency_max = t;
1910                 }
1911         }
1912 }
1913
1914
1915 /*
1916  * Verify following about latencies between nodes:
1917  *
1918  * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a))
1919  * - Local latencies same
1920  * - Local < remote
1921  * - Number of latencies seen is reasonable
1922  * - Number of occurrences of a given latency should be more than 1
1923  *
1924  * Returns:
1925  *      0       Success
1926  *      -1      Not symmetric
1927  *      -2      Local latencies not same
1928  *      -3      Local >= remote
1929  */
1930 static int
1931 lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info,
1932     lgrp_plat_latency_stats_t *lat_stats)
1933 {
1934         int                             i;
1935         int                             j;
1936         u_longlong_t                    t1;
1937         u_longlong_t                    t2;
1938
1939         ASSERT(memnode_info != NULL && lat_stats != NULL);
1940
1941         /*
1942          * Nothing to do when this is an UMA machine, lgroup topology is
1943          * limited to 2 levels, or there aren't any probe times yet
1944          */
1945         if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
1946             lat_stats->latencies[0][0] == 0)
1947                 return (0);
1948
1949         /*
1950          * Make sure that latencies are symmetric between any two nodes
1951          * (ie. latency(node0, node1) == latency(node1, node0))
1952          */
1953         for (i = 0; i < lgrp_plat_node_cnt; i++) {
1954                 if (!memnode_info[i].exists)
1955                         continue;
1956                 for (j = 0; j < lgrp_plat_node_cnt; j++) {
1957                         if (!memnode_info[j].exists)
1958                                 continue;
1959                         t1 = lat_stats->latencies[i][j];
1960                         t2 = lat_stats->latencies[j][i];
1961
1962                         if (t1 == 0 || t2 == 0 || t1 == t2)
1963                                 continue;
1964
1965                         return (-1);
1966                 }
1967         }
1968
1969         /*
1970          * Local latencies should be same
1971          */
1972         t1 = lat_stats->latencies[0][0];
1973         for (i = 1; i < lgrp_plat_node_cnt; i++) {
1974                 if (!memnode_info[i].exists)
1975                         continue;
1976
1977                 t2 = lat_stats->latencies[i][i];
1978                 if (t2 == 0)
1979                         continue;
1980
1981                 if (t1 == 0) {
1982                         t1 = t2;
1983                         continue;
1984                 }
1985
1986                 if (t1 != t2)
1987                         return (-2);
1988         }
1989
1990         /*
1991          * Local latencies should be less than remote
1992          */
1993         if (t1) {
1994                 for (i = 0; i < lgrp_plat_node_cnt; i++) {
1995                         for (j = 0; j < lgrp_plat_node_cnt; j++) {
1996                                 if (!memnode_info[j].exists)
1997                                         continue;
1998                                 t2 = lat_stats->latencies[i][j];
1999                                 if (i == j || t2 == 0)
2000                                         continue;
2001
2002                                 if (t1 >= t2)
2003                                         return (-3);
2004                         }
2005                 }
2006         }
2007
2008         return (0);
2009 }
2010
2011
2012 /*
2013  * Platform-specific initialization
2014  */
2015 static void
2016 lgrp_plat_main_init(void)
2017 {
2018         int     curnode;
2019         int     ht_limit;
2020         int     i;
2021
2022         /*
2023          * Print a notice that MPO is disabled when memory is interleaved
2024          * across nodes....Would do this when it is discovered, but can't
2025          * because it happens way too early during boot....
2026          */
2027         if (lgrp_plat_mem_intrlv)
2028                 cmn_err(CE_NOTE,
2029                     "MPO disabled because memory is interleaved\n");
2030
2031         /*
2032          * Don't bother to do any probing if it is disabled, there is only one
2033          * node, or the height of the lgroup topology less than or equal to 2
2034          */
2035         ht_limit = lgrp_topo_ht_limit();
2036         if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
2037             max_mem_nodes == 1 || ht_limit <= 2) {
2038                 /*
2039                  * Setup lgroup latencies for 2 level lgroup topology
2040                  * (ie. local and remote only) if they haven't been set yet
2041                  */
2042                 if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 &&
2043                     lgrp_plat_lat_stats.latency_max == 0)
2044                         lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
2045                 return;
2046         }
2047
2048         if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
2049                 /*
2050                  * Should have been able to probe from CPU 0 when it was added
2051                  * to lgroup hierarchy, but may not have been able to then
2052                  * because it happens so early in boot that gethrtime() hasn't
2053                  * been initialized.  (:-(
2054                  */
2055                 curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
2056                     lgrp_plat_cpu_node_nentries);
2057                 ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt);
2058                 if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0)
2059                         lgrp_plat_probe();
2060
2061                 return;
2062         }
2063
2064         /*
2065          * When probing memory, use one page for every sample to determine
2066          * lgroup topology and taking multiple samples
2067          */
2068         if (lgrp_plat_probe_mem_config.probe_memsize == 0)
2069                 lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE *
2070                     lgrp_plat_probe_nsamples;
2071
2072         /*
2073          * Map memory in each node needed for probing to determine latency
2074          * topology
2075          */
2076         for (i = 0; i < lgrp_plat_node_cnt; i++) {
2077                 int     mnode;
2078
2079                 /*
2080                  * Skip this node and leave its probe page NULL
2081                  * if it doesn't have any memory
2082                  */
2083                 mnode = i;
2084                 if (!mem_node_config[mnode].exists) {
2085                         lgrp_plat_probe_mem_config.probe_va[i] = NULL;
2086                         continue;
2087                 }
2088
2089                 /*
2090                  * Allocate one kernel virtual page
2091                  */
2092                 lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena,
2093                     lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP);
2094                 if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) {
2095                         cmn_err(CE_WARN,
2096                             "lgrp_plat_main_init: couldn't allocate memory");
2097                         return;
2098                 }
2099
2100                 /*
2101                  * Get PFN for first page in each node
2102                  */
2103                 lgrp_plat_probe_mem_config.probe_pfn[i] =
2104                     mem_node_config[mnode].physbase;
2105
2106                 /*
2107                  * Map virtual page to first page in node
2108                  */
2109                 hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i],
2110                     lgrp_plat_probe_mem_config.probe_memsize,
2111                     lgrp_plat_probe_mem_config.probe_pfn[i],
2112                     PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
2113                     HAT_LOAD_NOCONSIST);
2114         }
2115
2116         /*
2117          * Probe from current CPU
2118          */
2119         lgrp_plat_probe();
2120 }
2121
2122
2123 /*
2124  * Return the number of free, allocatable, or installed
2125  * pages in an lgroup
2126  * This is a copy of the MAX_MEM_NODES == 1 version of the routine
2127  * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup
2128  */
2129 static pgcnt_t
2130 lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
2131 {
2132         _NOTE(ARGUNUSED(lgrphand));
2133
2134         struct memlist *mlist;
2135         pgcnt_t npgs = 0;
2136         extern struct memlist *phys_avail;
2137         extern struct memlist *phys_install;
2138
2139         switch (query) {
2140         case LGRP_MEM_SIZE_FREE:
2141                 return ((pgcnt_t)freemem);
2142         case LGRP_MEM_SIZE_AVAIL:
2143                 memlist_read_lock();
2144                 for (mlist = phys_avail; mlist; mlist = mlist->ml_next)
2145                         npgs += btop(mlist->ml_size);
2146                 memlist_read_unlock();
2147                 return (npgs);
2148         case LGRP_MEM_SIZE_INSTALL:
2149                 memlist_read_lock();
2150                 for (mlist = phys_install; mlist; mlist = mlist->ml_next)
2151                         npgs += btop(mlist->ml_size);
2152                 memlist_read_unlock();
2153                 return (npgs);
2154         default:
2155                 return ((pgcnt_t)0);
2156         }
2157 }
2158
2159
2160 /*
2161  * Update node to proximity domain mappings for given domain and return node ID
2162  */
2163 static int
2164 lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt,
2165     uint32_t domain)
2166 {
2167         uint_t  node;
2168         uint_t  start;
2169
2170         /*
2171          * Hash proximity domain ID into node to domain mapping table (array)
2172          * and add entry for it into first non-existent or matching entry found
2173          */
2174         node = start = NODE_DOMAIN_HASH(domain, node_cnt);
2175         do {
2176                 /*
2177                  * Entry doesn't exist yet, so create one for this proximity
2178                  * domain and return node ID which is index into mapping table.
2179                  */
2180                 if (!node_domain[node].exists) {
2181                         node_domain[node].prox_domain = domain;
2182                         membar_producer();
2183                         node_domain[node].exists = 1;
2184                         return (node);
2185                 }
2186
2187                 /*
2188                  * Entry exists for this proximity domain already, so just
2189                  * return node ID (index into table).
2190                  */
2191                 if (node_domain[node].prox_domain == domain)
2192                         return (node);
2193                 node = NODE_DOMAIN_HASH(node + 1, node_cnt);
2194         } while (node != start);
2195
2196         /*
2197          * Ran out of supported number of entries which shouldn't happen....
2198          */
2199         ASSERT(node != start);
2200         return (-1);
2201 }
2202
2203 /*
2204  * Update node memory information for given proximity domain with specified
2205  * starting and ending physical address range (and return positive numbers for
2206  * success and negative ones for errors)
2207  */
2208 static int
2209 lgrp_plat_memnode_info_update(node_domain_map_t *node_domain, int node_cnt,
2210     memnode_phys_addr_map_t *memnode_info, int memnode_cnt, uint64_t start,
2211     uint64_t end, uint32_t domain, uint32_t device_id)
2212 {
2213         int     node, mnode;
2214
2215         /*
2216          * Get node number for proximity domain
2217          */
2218         node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
2219         if (node == -1) {
2220                 node = lgrp_plat_node_domain_update(node_domain, node_cnt,
2221                     domain);
2222                 if (node == -1)
2223                         return (-1);
2224         }
2225
2226         /*
2227          * This function is called during boot if device_id is
2228          * ACPI_MEMNODE_DEVID_BOOT, otherwise it's called at runtime for
2229          * memory DR operations.
2230          */
2231         if (device_id != ACPI_MEMNODE_DEVID_BOOT) {
2232                 ASSERT(lgrp_plat_max_mem_node <= memnode_cnt);
2233
2234                 for (mnode = lgrp_plat_node_cnt;
2235                     mnode < lgrp_plat_max_mem_node; mnode++) {
2236                         if (memnode_info[mnode].exists &&
2237                             memnode_info[mnode].prox_domain == domain &&
2238                             memnode_info[mnode].device_id == device_id) {
2239                                 if (btop(start) < memnode_info[mnode].start)
2240                                         memnode_info[mnode].start = btop(start);
2241                                 if (btop(end) > memnode_info[mnode].end)
2242                                         memnode_info[mnode].end = btop(end);
2243                                 return (1);
2244                         }
2245                 }
2246
2247                 if (lgrp_plat_max_mem_node >= memnode_cnt) {
2248                         return (-3);
2249                 } else {
2250                         lgrp_plat_max_mem_node++;
2251                         memnode_info[mnode].start = btop(start);
2252                         memnode_info[mnode].end = btop(end);
2253                         memnode_info[mnode].prox_domain = domain;
2254                         memnode_info[mnode].device_id = device_id;
2255                         memnode_info[mnode].lgrphand = node;
2256                         membar_producer();
2257                         memnode_info[mnode].exists = 1;
2258                         return (0);
2259                 }
2260         }
2261
2262         /*
2263          * Create entry in table for node if it doesn't exist
2264          */
2265         ASSERT(node < memnode_cnt);
2266         if (!memnode_info[node].exists) {
2267                 memnode_info[node].start = btop(start);
2268                 memnode_info[node].end = btop(end);
2269                 memnode_info[node].prox_domain = domain;
2270                 memnode_info[node].device_id = device_id;
2271                 memnode_info[node].lgrphand = node;
2272                 membar_producer();
2273                 memnode_info[node].exists = 1;
2274                 return (0);
2275         }
2276
2277         /*
2278          * Entry already exists for this proximity domain
2279          *
2280          * There may be more than one SRAT memory entry for a domain, so we may
2281          * need to update existing start or end address for the node.
2282          */
2283         if (memnode_info[node].prox_domain == domain) {
2284                 if (btop(start) < memnode_info[node].start)
2285                         memnode_info[node].start = btop(start);
2286                 if (btop(end) > memnode_info[node].end)
2287                         memnode_info[node].end = btop(end);
2288                 return (1);
2289         }
2290         return (-2);
2291 }
2292
2293
2294 /*
2295  * Have to sort nodes by starting physical address because plat_mnode_xcheck()
2296  * assumes and expects memnodes to be sorted in ascending order by physical
2297  * address.
2298  */
2299 static void
2300 lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt,
2301     cpu_node_map_t *cpu_node, int cpu_count,
2302     memnode_phys_addr_map_t *memnode_info)
2303 {
2304         boolean_t       found;
2305         int             i;
2306         int             j;
2307         int             n;
2308         boolean_t       sorted;
2309         boolean_t       swapped;
2310
2311         if (!lgrp_plat_node_sort_enable || node_cnt <= 1 ||
2312             node_domain == NULL || memnode_info == NULL)
2313                 return;
2314
2315         /*
2316          * Sorted already?
2317          */
2318         sorted = B_TRUE;
2319         for (i = 0; i < node_cnt - 1; i++) {
2320                 /*
2321                  * Skip entries that don't exist
2322                  */
2323                 if (!memnode_info[i].exists)
2324                         continue;
2325
2326                 /*
2327                  * Try to find next existing entry to compare against
2328                  */
2329                 found = B_FALSE;
2330                 for (j = i + 1; j < node_cnt; j++) {
2331                         if (memnode_info[j].exists) {
2332                                 found = B_TRUE;
2333                                 break;
2334                         }
2335                 }
2336
2337                 /*
2338                  * Done if no more existing entries to compare against
2339                  */
2340                 if (found == B_FALSE)
2341                         break;
2342
2343                 /*
2344                  * Not sorted if starting address of current entry is bigger
2345                  * than starting address of next existing entry
2346                  */
2347                 if (memnode_info[i].start > memnode_info[j].start) {
2348                         sorted = B_FALSE;
2349                         break;
2350                 }
2351         }
2352
2353         /*
2354          * Don't need to sort if sorted already
2355          */
2356         if (sorted == B_TRUE)
2357                 return;
2358
2359         /*
2360          * Just use bubble sort since number of nodes is small
2361          */
2362         n = node_cnt;
2363         do {
2364                 swapped = B_FALSE;
2365                 n--;
2366                 for (i = 0; i < n; i++) {
2367                         /*
2368                          * Skip entries that don't exist
2369                          */
2370                         if (!memnode_info[i].exists)
2371                                 continue;
2372
2373                         /*
2374                          * Try to find next existing entry to compare against
2375                          */
2376                         found = B_FALSE;
2377                         for (j = i + 1; j <= n; j++) {
2378                                 if (memnode_info[j].exists) {
2379                                         found = B_TRUE;
2380                                         break;
2381                                 }
2382                         }
2383
2384                         /*
2385                          * Done if no more existing entries to compare against
2386                          */
2387                         if (found == B_FALSE)
2388                                 break;
2389
2390                         if (memnode_info[i].start > memnode_info[j].start) {
2391                                 memnode_phys_addr_map_t save_addr;
2392                                 node_domain_map_t       save_node;
2393
2394                                 /*
2395                                  * Swap node to proxmity domain ID assignments
2396                                  */
2397                                 bcopy(&node_domain[i], &save_node,
2398                                     sizeof (node_domain_map_t));
2399                                 bcopy(&node_domain[j], &node_domain[i],
2400                                     sizeof (node_domain_map_t));
2401                                 bcopy(&save_node, &node_domain[j],
2402                                     sizeof (node_domain_map_t));
2403
2404                                 /*
2405                                  * Swap node to physical memory assignments
2406                                  */
2407                                 bcopy(&memnode_info[i], &save_addr,
2408                                     sizeof (memnode_phys_addr_map_t));
2409                                 bcopy(&memnode_info[j], &memnode_info[i],
2410                                     sizeof (memnode_phys_addr_map_t));
2411                                 bcopy(&save_addr, &memnode_info[j],
2412                                     sizeof (memnode_phys_addr_map_t));
2413                                 swapped = B_TRUE;
2414                         }
2415                 }
2416         } while (swapped == B_TRUE);
2417
2418         /*
2419          * Check to make sure that CPUs assigned to correct node IDs now since
2420          * node to proximity domain ID assignments may have been changed above
2421          */
2422         if (n == node_cnt - 1 || cpu_node == NULL || cpu_count < 1)
2423                 return;
2424         for (i = 0; i < cpu_count; i++) {
2425                 int             node;
2426
2427                 node = lgrp_plat_domain_to_node(node_domain, node_cnt,
2428                     cpu_node[i].prox_domain);
2429                 if (cpu_node[i].node != node)
2430                         cpu_node[i].node = node;
2431         }
2432
2433 }
2434
2435
2436 /*
2437  * Return time needed to probe from current CPU to memory in given node
2438  */
2439 static hrtime_t
2440 lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, int cpu_node_nentries,
2441     lgrp_plat_probe_mem_config_t *probe_mem_config,
2442     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
2443 {
2444         caddr_t                 buf;
2445         hrtime_t                elapsed;
2446         hrtime_t                end;
2447         int                     from;
2448         int                     i;
2449         int                     ipl;
2450         hrtime_t                max;
2451         hrtime_t                min;
2452         hrtime_t                start;
2453         extern int              use_sse_pagecopy;
2454
2455         /*
2456          * Determine ID of node containing current CPU
2457          */
2458         from = lgrp_plat_cpu_to_node(CPU, cpu_node, cpu_node_nentries);
2459         ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
2460
2461         /*
2462          * Do common work for probing main memory
2463          */
2464         if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) {
2465                 /*
2466                  * Skip probing any nodes without memory and
2467                  * set probe time to 0
2468                  */
2469                 if (probe_mem_config->probe_va[to] == NULL) {
2470                         lat_stats->latencies[from][to] = 0;
2471                         return (0);
2472                 }
2473
2474                 /*
2475                  * Invalidate caches once instead of once every sample
2476                  * which should cut cost of probing by a lot
2477                  */
2478                 probe_stats->flush_cost = gethrtime();
2479                 invalidate_cache();
2480                 probe_stats->flush_cost = gethrtime() -
2481                     probe_stats->flush_cost;
2482                 probe_stats->probe_cost_total += probe_stats->flush_cost;
2483         }
2484
2485         /*
2486          * Probe from current CPU to given memory using specified operation
2487          * and take specified number of samples
2488          */
2489         max = 0;
2490         min = -1;
2491         for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
2492                 probe_stats->probe_cost = gethrtime();
2493
2494                 /*
2495                  * Can't measure probe time if gethrtime() isn't working yet
2496                  */
2497                 if (probe_stats->probe_cost == 0 && gethrtime() == 0)
2498                         return (0);
2499
2500                 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
2501                         /*
2502                          * Measure how long it takes to read vendor ID from
2503                          * Northbridge
2504                          */
2505                         elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads);
2506                 } else {
2507                         /*
2508                          * Measure how long it takes to copy page
2509                          * on top of itself
2510                          */
2511                         buf = probe_mem_config->probe_va[to] + (i * PAGESIZE);
2512
2513                         kpreempt_disable();
2514                         ipl = splhigh();
2515                         start = gethrtime();
2516                         if (use_sse_pagecopy)
2517                                 hwblkpagecopy(buf, buf);
2518                         else
2519                                 bcopy(buf, buf, PAGESIZE);
2520                         end = gethrtime();
2521                         elapsed = end - start;
2522                         splx(ipl);
2523                         kpreempt_enable();
2524                 }
2525
2526                 probe_stats->probe_cost = gethrtime() -
2527                     probe_stats->probe_cost;
2528                 probe_stats->probe_cost_total += probe_stats->probe_cost;
2529
2530                 if (min == -1 || elapsed < min)
2531                         min = elapsed;
2532                 if (elapsed > max)
2533                         max = elapsed;
2534         }
2535
2536         /*
2537          * Update minimum and maximum probe times between
2538          * these two nodes
2539          */
2540         if (min < probe_stats->probe_min[from][to] ||
2541             probe_stats->probe_min[from][to] == 0)
2542                 probe_stats->probe_min[from][to] = min;
2543
2544         if (max > probe_stats->probe_max[from][to])
2545                 probe_stats->probe_max[from][to] = max;
2546
2547         return (min);
2548 }
2549
2550
2551 /*
2552  * Read boot property with CPU to APIC ID array, fill in CPU to node ID
2553  * mapping table with APIC ID for each CPU (if pointer to table isn't NULL),
2554  * and return number of CPU APIC IDs.
2555  *
2556  * NOTE: This code assumes that CPU IDs are assigned in order that they appear
2557  *       in in cpu_apicid_array boot property which is based on and follows
2558  *       same ordering as processor list in ACPI MADT.  If the code in
2559  *       usr/src/uts/i86pc/io/pcplusmp/apic.c that reads MADT and assigns
2560  *       CPU IDs ever changes, then this code will need to change too....
2561  */
2562 static int
2563 lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node)
2564 {
2565         int     boot_prop_len;
2566         char    *boot_prop_name = BP_CPU_APICID_ARRAY;
2567         uint32_t *cpu_apicid_array;
2568         int     i;
2569         int     n;
2570
2571         /*
2572          * Check length of property value
2573          */
2574         boot_prop_len = BOP_GETPROPLEN(bootops, boot_prop_name);
2575         if (boot_prop_len <= 0)
2576                 return (-1);
2577
2578         /*
2579          * Calculate number of entries in array and return when the system is
2580          * not very interesting for NUMA. It's not interesting for NUMA if
2581          * system has only one CPU and doesn't support CPU hotplug.
2582          */
2583         n = boot_prop_len / sizeof (*cpu_apicid_array);
2584         if (n == 1 && !plat_dr_support_cpu())
2585                 return (-2);
2586
2587         cpu_apicid_array = (uint32_t *)BOP_ALLOC(bootops, NULL, boot_prop_len,
2588             sizeof (*cpu_apicid_array));
2589         /*
2590          * Get CPU to APIC ID property value
2591          */
2592         if (cpu_apicid_array == NULL ||
2593             BOP_GETPROP(bootops, boot_prop_name, cpu_apicid_array) < 0)
2594                 return (-3);
2595
2596         /*
2597          * Just return number of CPU APIC IDs if CPU to node mapping table is
2598          * NULL
2599          */
2600         if (cpu_node == NULL) {
2601                 if (plat_dr_support_cpu() && n >= boot_ncpus) {
2602                         return (boot_ncpus);
2603                 } else {
2604                         return (n);
2605                 }
2606         }
2607
2608         /*
2609          * Fill in CPU to node ID mapping table with APIC ID for each CPU
2610          */
2611         for (i = 0; i < n; i++) {
2612                 /* Only add boot CPUs into the map if CPU DR is enabled. */
2613                 if (plat_dr_support_cpu() && i >= boot_ncpus)
2614                         break;
2615                 cpu_node[i].exists = 1;
2616                 cpu_node[i].apicid = cpu_apicid_array[i];
2617                 cpu_node[i].prox_domain = UINT32_MAX;
2618                 cpu_node[i].node = UINT_MAX;
2619         }
2620
2621         /*
2622          * Return number of CPUs based on number of APIC IDs
2623          */
2624         return (i);
2625 }
2626
2627
2628 /*
2629  * Read ACPI System Locality Information Table (SLIT) to determine how far each
2630  * NUMA node is from each other
2631  */
2632 static int
2633 lgrp_plat_process_slit(ACPI_TABLE_SLIT *tp,
2634     node_domain_map_t *node_domain, uint_t node_cnt,
2635     memnode_phys_addr_map_t *memnode_info, lgrp_plat_latency_stats_t *lat_stats)
2636 {
2637         int             i;
2638         int             j;
2639         int             src;
2640         int             dst;
2641         int             localities;
2642         hrtime_t        max;
2643         hrtime_t        min;
2644         int             retval;
2645         uint8_t         *slit_entries;
2646
2647         if (tp == NULL || !lgrp_plat_slit_enable)
2648                 return (1);
2649
2650         if (lat_stats == NULL)
2651                 return (2);
2652
2653         localities = tp->LocalityCount;
2654
2655         min = lat_stats->latency_min;
2656         max = lat_stats->latency_max;
2657
2658         /*
2659          * Fill in latency matrix based on SLIT entries
2660          */
2661         slit_entries = tp->Entry;
2662         for (i = 0; i < localities; i++) {
2663                 src = lgrp_plat_domain_to_node(node_domain,
2664                     node_cnt, i);
2665                 if (src == -1)
2666                         continue;
2667
2668                 for (j = 0; j < localities; j++) {
2669                         uint8_t latency;
2670
2671                         dst = lgrp_plat_domain_to_node(node_domain,
2672                             node_cnt, j);
2673                         if (dst == -1)
2674                                 continue;
2675
2676                         latency = slit_entries[(i * localities) + j];
2677                         lat_stats->latencies[src][dst] = latency;
2678                         if (latency < min || min == -1)
2679                                 min = latency;
2680                         if (latency > max)
2681                                 max = latency;
2682                 }
2683         }
2684
2685         /*
2686          * Verify that latencies/distances given in SLIT look reasonable
2687          */
2688         retval = lgrp_plat_latency_verify(memnode_info, lat_stats);
2689
2690         if (retval) {
2691                 /*
2692                  * Reinitialize (zero) latency table since SLIT doesn't look
2693                  * right
2694                  */
2695                 for (i = 0; i < localities; i++) {
2696                         for (j = 0; j < localities; j++)
2697                                 lat_stats->latencies[i][j] = 0;
2698                 }
2699         } else {
2700                 /*
2701                  * Update min and max latencies seen since SLIT looks valid
2702                  */
2703                 lat_stats->latency_min = min;
2704                 lat_stats->latency_max = max;
2705         }
2706
2707         return (retval);
2708 }
2709
2710
2711 /*
2712  * Update lgrp latencies according to information returned by ACPI _SLI method.
2713  */
2714 static int
2715 lgrp_plat_process_sli(uint32_t domain_id, uchar_t *sli_info,
2716     uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt,
2717     lgrp_plat_latency_stats_t *lat_stats)
2718 {
2719         int             i;
2720         int             src, dst;
2721         uint8_t         latency;
2722         hrtime_t        max, min;
2723
2724         if (lat_stats == NULL || sli_info == NULL ||
2725             sli_cnt == 0 || domain_id >= sli_cnt)
2726                 return (-1);
2727
2728         src = lgrp_plat_domain_to_node(node_domain, node_cnt, domain_id);
2729         if (src == -1) {
2730                 src = lgrp_plat_node_domain_update(node_domain, node_cnt,
2731                     domain_id);
2732                 if (src == -1)
2733                         return (-1);
2734         }
2735
2736         /*
2737          * Don't update latency info if topology has been flattened to 2 levels.
2738          */
2739         if (lgrp_plat_topo_flatten != 0) {
2740                 return (0);
2741         }
2742
2743         /*
2744          * Latency information for proximity domain is ready.
2745          * TODO: support adjusting latency information at runtime.
2746          */
2747         if (lat_stats->latencies[src][src] != 0) {
2748                 return (0);
2749         }
2750
2751         /* Validate latency information. */
2752         for (i = 0; i < sli_cnt; i++) {
2753                 if (i == domain_id) {
2754                         if (sli_info[i] != ACPI_SLIT_SELF_LATENCY ||
2755                             sli_info[sli_cnt + i] != ACPI_SLIT_SELF_LATENCY) {
2756                                 return (-1);
2757                         }
2758                 } else {
2759                         if (sli_info[i] <= ACPI_SLIT_SELF_LATENCY ||
2760                             sli_info[sli_cnt + i] <= ACPI_SLIT_SELF_LATENCY ||
2761                             sli_info[i] != sli_info[sli_cnt + i]) {
2762                                 return (-1);
2763                         }
2764                 }
2765         }
2766
2767         min = lat_stats->latency_min;
2768         max = lat_stats->latency_max;
2769         for (i = 0; i < sli_cnt; i++) {
2770                 dst = lgrp_plat_domain_to_node(node_domain, node_cnt, i);
2771                 if (dst == -1)
2772                         continue;
2773
2774                 ASSERT(sli_info[i] == sli_info[sli_cnt + i]);
2775
2776                 /* Update row in latencies matrix. */
2777                 latency = sli_info[i];
2778                 lat_stats->latencies[src][dst] = latency;
2779                 if (latency < min || min == -1)
2780                         min = latency;
2781                 if (latency > max)
2782                         max = latency;
2783
2784                 /* Update column in latencies matrix. */
2785                 latency = sli_info[sli_cnt + i];
2786                 lat_stats->latencies[dst][src] = latency;
2787                 if (latency < min || min == -1)
2788                         min = latency;
2789                 if (latency > max)
2790                         max = latency;
2791         }
2792         lat_stats->latency_min = min;
2793         lat_stats->latency_max = max;
2794
2795         return (0);
2796 }
2797
2798
2799 /*
2800  * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs
2801  * and memory are local to each other in the same NUMA node and return number
2802  * of nodes
2803  */
2804 static int
2805 lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp,
2806     uint32_t *prox_domain_min, node_domain_map_t *node_domain,
2807     cpu_node_map_t *cpu_node, int cpu_count,
2808     memnode_phys_addr_map_t *memnode_info)
2809 {
2810         ACPI_SUBTABLE_HEADER    *item, *srat_end;
2811         int                     i;
2812         int                     node_cnt;
2813         int                     proc_entry_count;
2814         int                     rc;
2815
2816         /*
2817          * Nothing to do when no SRAT or disabled
2818          */
2819         if (tp == NULL || !lgrp_plat_srat_enable)
2820                 return (-1);
2821
2822         /*
2823          * Try to get domain information from MSCT table.
2824          * ACPI4.0: OSPM will use information provided by the MSCT only
2825          * when the System Resource Affinity Table (SRAT) exists.
2826          */
2827         node_cnt = lgrp_plat_msct_domains(mp, prox_domain_min);
2828         if (node_cnt <= 0) {
2829                 /*
2830                  * Determine number of nodes by counting number of proximity
2831                  * domains in SRAT.
2832                  */
2833                 node_cnt = lgrp_plat_srat_domains(tp, prox_domain_min);
2834         }
2835         /*
2836          * Return if number of nodes is 1 or less since don't need to read SRAT.
2837          */
2838         if (node_cnt == 1)
2839                 return (1);
2840         else if (node_cnt <= 0)
2841                 return (-2);
2842
2843         /*
2844          * Walk through SRAT, examining each CPU and memory entry to determine
2845          * which CPUs and memory belong to which node.
2846          */
2847         item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)tp + sizeof (*tp));
2848         srat_end = (ACPI_SUBTABLE_HEADER *)(tp->Header.Length + (uintptr_t)tp);
2849         proc_entry_count = 0;
2850         while (item < srat_end) {
2851                 uint32_t        apic_id;
2852                 uint32_t        domain;
2853                 uint64_t        end;
2854                 uint64_t        length;
2855                 uint64_t        start;
2856
2857                 switch (item->Type) {
2858                 case ACPI_SRAT_TYPE_CPU_AFFINITY: {     /* CPU entry */
2859                         ACPI_SRAT_CPU_AFFINITY *cpu =
2860                             (ACPI_SRAT_CPU_AFFINITY *) item;
2861
2862                         if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED) ||
2863                             cpu_node == NULL)
2864                                 break;
2865
2866                         /*
2867                          * Calculate domain (node) ID and fill in APIC ID to
2868                          * domain/node mapping table
2869                          */
2870                         domain = cpu->ProximityDomainLo;
2871                         for (i = 0; i < 3; i++) {
2872                                 domain += cpu->ProximityDomainHi[i] <<
2873                                     ((i + 1) * 8);
2874                         }
2875                         apic_id = cpu->ApicId;
2876
2877                         rc = lgrp_plat_cpu_node_update(node_domain, node_cnt,
2878                             cpu_node, cpu_count, apic_id, domain);
2879                         if (rc < 0)
2880                                 return (-3);
2881                         else if (rc == 0)
2882                                 proc_entry_count++;
2883                         break;
2884                 }
2885                 case ACPI_SRAT_TYPE_MEMORY_AFFINITY: {  /* memory entry */
2886                         ACPI_SRAT_MEM_AFFINITY *mem =
2887                             (ACPI_SRAT_MEM_AFFINITY *)item;
2888
2889                         if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED) ||
2890                             memnode_info == NULL)
2891                                 break;
2892
2893                         /*
2894                          * Get domain (node) ID and fill in domain/node
2895                          * to memory mapping table
2896                          */
2897                         domain = mem->ProximityDomain;
2898                         start = mem->BaseAddress;
2899                         length = mem->Length;
2900                         end = start + length - 1;
2901
2902                         /*
2903                          * According to ACPI 4.0, both ENABLE and HOTPLUG flags
2904                          * may be set for memory address range entries in SRAT
2905                          * table which are reserved for memory hot plug.
2906                          * We intersect memory address ranges in SRAT table
2907                          * with memory ranges in physinstalled to filter out
2908                          * memory address ranges reserved for hot plug.
2909                          */
2910                         if (mem->Flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
2911                                 uint64_t        rstart = UINT64_MAX;
2912                                 uint64_t        rend = 0;
2913                                 struct memlist  *ml;
2914                                 extern struct bootops   *bootops;
2915
2916                                 memlist_read_lock();
2917                                 for (ml = bootops->boot_mem->physinstalled;
2918                                     ml; ml = ml->ml_next) {
2919                                         uint64_t tstart = ml->ml_address;
2920                                         uint64_t tend;
2921
2922                                         tend = ml->ml_address + ml->ml_size;
2923                                         if (tstart > end || tend < start)
2924                                                 continue;
2925                                         if (start > tstart)
2926                                                 tstart = start;
2927                                         if (rstart > tstart)
2928                                                 rstart = tstart;
2929                                         if (end < tend)
2930                                                 tend = end;
2931                                         if (rend < tend)
2932                                                 rend = tend;
2933                                 }
2934                                 memlist_read_unlock();
2935                                 start = rstart;
2936                                 end = rend;
2937                                 /* Skip this entry if no memory installed. */
2938                                 if (start > end)
2939                                         break;
2940                         }
2941
2942                         if (lgrp_plat_memnode_info_update(node_domain,
2943                             node_cnt, memnode_info, node_cnt,
2944                             start, end, domain, ACPI_MEMNODE_DEVID_BOOT) < 0)
2945                                 return (-4);
2946                         break;
2947                 }
2948                 case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: {      /* x2apic CPU */
2949                         ACPI_SRAT_X2APIC_CPU_AFFINITY *x2cpu =
2950                             (ACPI_SRAT_X2APIC_CPU_AFFINITY *) item;
2951
2952                         if (!(x2cpu->Flags & ACPI_SRAT_CPU_ENABLED) ||
2953                             cpu_node == NULL)
2954                                 break;
2955
2956                         /*
2957                          * Calculate domain (node) ID and fill in APIC ID to
2958                          * domain/node mapping table
2959                          */
2960                         domain = x2cpu->ProximityDomain;
2961                         apic_id = x2cpu->ApicId;
2962
2963                         rc = lgrp_plat_cpu_node_update(node_domain, node_cnt,
2964                             cpu_node, cpu_count, apic_id, domain);
2965                         if (rc < 0)
2966                                 return (-3);
2967                         else if (rc == 0)
2968                                 proc_entry_count++;
2969                         break;
2970                 }
2971                 default:
2972                         break;
2973                 }
2974
2975                 item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item + item->Length);
2976         }
2977
2978         /*
2979          * Should have seen at least as many SRAT processor entries as CPUs
2980          */
2981         if (proc_entry_count < cpu_count)
2982                 return (-5);
2983
2984         /*
2985          * Need to sort nodes by starting physical address since VM system
2986          * assumes and expects memnodes to be sorted in ascending order by
2987          * physical address
2988          */
2989         lgrp_plat_node_sort(node_domain, node_cnt, cpu_node, cpu_count,
2990             memnode_info);
2991
2992         return (node_cnt);
2993 }
2994
2995
2996 /*
2997  * Allocate permanent memory for any temporary memory that we needed to
2998  * allocate using BOP_ALLOC() before kmem_alloc() and VM system were
2999  * initialized and copy everything from temporary to permanent memory since
3000  * temporary boot memory will eventually be released during boot
3001  */
3002 static void
3003 lgrp_plat_release_bootstrap(void)
3004 {
3005         void    *buf;
3006         size_t  size;
3007
3008         if (lgrp_plat_cpu_node_nentries > 0) {
3009                 size = lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t);
3010                 buf = kmem_alloc(size, KM_SLEEP);
3011                 bcopy(lgrp_plat_cpu_node, buf, size);
3012                 lgrp_plat_cpu_node = buf;
3013         }
3014 }
3015
3016
3017 /*
3018  * Return number of proximity domains given in ACPI SRAT
3019  */
3020 static int
3021 lgrp_plat_srat_domains(ACPI_TABLE_SRAT *tp, uint32_t *prox_domain_min)
3022 {
3023         int                     domain_cnt;
3024         uint32_t                domain_min;
3025         ACPI_SUBTABLE_HEADER    *item, *end;
3026         int                     i;
3027         node_domain_map_t       node_domain[MAX_NODES];
3028
3029
3030         if (tp == NULL || !lgrp_plat_srat_enable)
3031                 return (1);
3032
3033         /*
3034          * Walk through SRAT to find minimum proximity domain ID
3035          */
3036         domain_min = UINT32_MAX;
3037         item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)tp + sizeof (*tp));
3038         end = (ACPI_SUBTABLE_HEADER *)(tp->Header.Length + (uintptr_t)tp);
3039         while (item < end) {
3040                 uint32_t        domain;
3041
3042                 switch (item->Type) {
3043                 case ACPI_SRAT_TYPE_CPU_AFFINITY: {     /* CPU entry */
3044                         ACPI_SRAT_CPU_AFFINITY *cpu =
3045                             (ACPI_SRAT_CPU_AFFINITY *) item;
3046
3047                         if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
3048                                 item = (ACPI_SUBTABLE_HEADER *)
3049                                     ((uintptr_t)item + item->Length);
3050                                 continue;
3051                         }
3052                         domain = cpu->ProximityDomainLo;
3053                         for (i = 0; i < 3; i++) {
3054                                 domain += cpu->ProximityDomainHi[i] <<
3055                                     ((i + 1) * 8);
3056                         }
3057                         break;
3058                 }
3059                 case ACPI_SRAT_TYPE_MEMORY_AFFINITY: {  /* memory entry */
3060                         ACPI_SRAT_MEM_AFFINITY *mem =
3061                             (ACPI_SRAT_MEM_AFFINITY *)item;
3062
3063                         if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED)) {
3064                                 item = (ACPI_SUBTABLE_HEADER *)
3065                                     ((uintptr_t)item + item->Length);
3066                                 continue;
3067                         }
3068                         domain = mem->ProximityDomain;
3069                         break;
3070                 }
3071                 case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: {      /* x2apic CPU */
3072                         ACPI_SRAT_X2APIC_CPU_AFFINITY *x2cpu =
3073                             (ACPI_SRAT_X2APIC_CPU_AFFINITY *) item;
3074
3075                         if (!(x2cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
3076                                 item = (ACPI_SUBTABLE_HEADER *)
3077                                     ((uintptr_t)item + item->Length);
3078                                 continue;
3079                         }
3080                         domain = x2cpu->ProximityDomain;
3081                         break;
3082                 }
3083                 default:
3084                         item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item +
3085                             item->Length);
3086                         continue;
3087                 }
3088
3089                 /*
3090                  * Keep track of minimum proximity domain ID
3091                  */
3092                 if (domain < domain_min)
3093                         domain_min = domain;
3094
3095                 item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item + item->Length);
3096         }
3097         if (lgrp_plat_domain_min_enable && prox_domain_min != NULL)
3098                 *prox_domain_min = domain_min;
3099
3100         /*
3101          * Walk through SRAT, examining each CPU and memory entry to determine
3102          * proximity domain ID for each.
3103          */
3104         domain_cnt = 0;
3105         item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)tp + sizeof (*tp));
3106         end = (ACPI_SUBTABLE_HEADER *)(tp->Header.Length + (uintptr_t)tp);
3107         bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t));
3108         while (item < end) {
3109                 uint32_t        domain;
3110                 boolean_t       overflow;
3111                 uint_t          start;
3112
3113                 switch (item->Type) {
3114                 case ACPI_SRAT_TYPE_CPU_AFFINITY: {     /* CPU entry */
3115                         ACPI_SRAT_CPU_AFFINITY *cpu =
3116                             (ACPI_SRAT_CPU_AFFINITY *) item;
3117
3118                         if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
3119                                 item = (ACPI_SUBTABLE_HEADER *)
3120                                     ((uintptr_t)item + item->Length);
3121                                 continue;
3122                         }
3123                         domain = cpu->ProximityDomainLo;
3124                         for (i = 0; i < 3; i++) {
3125                                 domain += cpu->ProximityDomainHi[i] <<
3126                                     ((i + 1) * 8);
3127                         }
3128                         break;
3129                 }
3130                 case ACPI_SRAT_TYPE_MEMORY_AFFINITY: {  /* memory entry */
3131                         ACPI_SRAT_MEM_AFFINITY *mem =
3132                             (ACPI_SRAT_MEM_AFFINITY *)item;
3133
3134                         if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED)) {
3135                                 item = (ACPI_SUBTABLE_HEADER *)
3136                                     ((uintptr_t)item + item->Length);
3137                                 continue;
3138                         }
3139                         domain = mem->ProximityDomain;
3140                         break;
3141                 }
3142                 case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: {      /* x2apic CPU */
3143                         ACPI_SRAT_X2APIC_CPU_AFFINITY *x2cpu =
3144                             (ACPI_SRAT_X2APIC_CPU_AFFINITY *) item;
3145
3146                         if (!(x2cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
3147                                 item = (ACPI_SUBTABLE_HEADER *)
3148                                     ((uintptr_t)item + item->Length);
3149                                 continue;
3150                         }
3151                         domain = x2cpu->ProximityDomain;
3152                         break;
3153                 }
3154                 default:
3155                         item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item +
3156                             item->Length);
3157                         continue;
3158                 }
3159
3160                 /*
3161                  * Count and keep track of which proximity domain IDs seen
3162                  */
3163                 start = i = domain % MAX_NODES;
3164                 overflow = B_TRUE;
3165                 do {
3166                         /*
3167                          * Create entry for proximity domain and increment
3168                          * count when no entry exists where proximity domain
3169                          * hashed
3170                          */
3171                         if (!node_domain[i].exists) {
3172                                 node_domain[i].exists = 1;
3173                                 node_domain[i].prox_domain = domain;
3174                                 domain_cnt++;
3175                                 overflow = B_FALSE;
3176                                 break;
3177                         }
3178
3179                         /*
3180                          * Nothing to do when proximity domain seen already
3181                          * and its entry exists
3182                          */
3183                         if (node_domain[i].prox_domain == domain) {
3184                                 overflow = B_FALSE;
3185                                 break;
3186                         }
3187
3188                         /*
3189                          * Entry exists where proximity domain hashed, but for
3190                          * different proximity domain so keep search for empty
3191                          * slot to put it or matching entry whichever comes
3192                          * first.
3193                          */
3194                         i = (i + 1) % MAX_NODES;
3195                 } while (i != start);
3196
3197                 /*
3198                  * Didn't find empty or matching entry which means have more
3199                  * proximity domains than supported nodes (:-(
3200                  */
3201                 ASSERT(overflow != B_TRUE);
3202                 if (overflow == B_TRUE)
3203                         return (-1);
3204
3205                 item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item + item->Length);
3206         }
3207         return (domain_cnt);
3208 }
3209
3210
3211 /*
3212  * Parse domain information in ACPI Maximum System Capability Table (MSCT).
3213  * MSCT table has been verified in function process_msct() in fakebop.c.
3214  */
3215 static int
3216 lgrp_plat_msct_domains(ACPI_TABLE_MSCT *tp, uint32_t *prox_domain_min)
3217 {
3218         int last_seen = 0;
3219         uint32_t proxmin = UINT32_MAX;
3220         ACPI_MSCT_PROXIMITY *item, *end;
3221
3222         if (tp == NULL || lgrp_plat_msct_enable == 0)
3223                 return (-1);
3224
3225         if (tp->MaxProximityDomains >= MAX_NODES) {
3226                 cmn_err(CE_CONT,
3227                     "?lgrp: too many proximity domains (%d), max %d supported, "
3228                     "disable support of CPU/memory DR operations.",
3229                     tp->MaxProximityDomains + 1, MAX_NODES);
3230                 plat_dr_disable_cpu();
3231                 plat_dr_disable_memory();
3232                 return (-1);
3233         }
3234
3235         if (prox_domain_min != NULL) {
3236                 end = (void *)(tp->Header.Length + (uintptr_t)tp);
3237                 for (item = (void *)((uintptr_t)tp +
3238                     tp->ProximityOffset); item < end;
3239                     item = (void *)(item->Length + (uintptr_t)item)) {
3240                         if (item->RangeStart < proxmin) {
3241                                 proxmin = item->RangeStart;
3242                         }
3243
3244                         last_seen = item->RangeEnd - item->RangeStart + 1;
3245                         /*
3246                          * Break out if all proximity domains have been
3247                          * processed. Some BIOSes may have unused items
3248                          * at the end of MSCT table.
3249                          */
3250                         if (last_seen > tp->MaxProximityDomains) {
3251                                 break;
3252                         }
3253                 }
3254                 *prox_domain_min = proxmin;
3255         }
3256
3257         return (tp->MaxProximityDomains + 1);
3258 }
3259
3260
3261 /*
3262  * Set lgroup latencies for 2 level lgroup topology
3263  */
3264 static void
3265 lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats)
3266 {
3267         int     i, j;
3268
3269         ASSERT(lat_stats != NULL);
3270
3271         if (lgrp_plat_node_cnt >= 4)
3272                 cmn_err(CE_NOTE,
3273                     "MPO only optimizing for local and remote\n");
3274         for (i = 0; i < lgrp_plat_node_cnt; i++) {
3275                 for (j = 0; j < lgrp_plat_node_cnt; j++) {
3276                         if (i == j)
3277                                 lat_stats->latencies[i][j] = 2;
3278                         else
3279                                 lat_stats->latencies[i][j] = 3;
3280                 }
3281         }
3282         lat_stats->latency_min = 2;
3283         lat_stats->latency_max = 3;
3284         /* TODO: check it. */
3285         lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
3286         lgrp_plat_topo_flatten = 1;
3287 }
3288
3289
3290 /*
3291  * The following Opteron specific constants, macros, types, and routines define
3292  * PCI configuration space registers and how to read them to determine the NUMA
3293  * configuration of *supported* Opteron processors.  They provide the same
3294  * information that may be gotten from the ACPI System Resource Affinity Table
3295  * (SRAT) if it exists on the machine of interest.
3296  *
3297  * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family
3298  * of interest describes all of these registers and their contents.  The main
3299  * registers used by this code to determine the NUMA configuration of the
3300  * machine are the node ID register for the number of NUMA nodes and the DRAM
3301  * address map registers for the physical address range of each node.
3302  *
3303  * NOTE: The format and how to determine the NUMA configuration using PCI
3304  *       config space registers may change or may not be supported in future
3305  *       Opteron processor families.
3306  */
3307
3308 /*
3309  * How many bits to shift Opteron DRAM Address Map base and limit registers
3310  * to get actual value
3311  */
3312 #define OPT_DRAMADDR_HI_LSHIFT_ADDR     40      /* shift left for address */
3313 #define OPT_DRAMADDR_LO_LSHIFT_ADDR     8       /* shift left for address */
3314
3315 #define OPT_DRAMADDR_HI_MASK_ADDR       0x000000FF /* address bits 47-40 */
3316 #define OPT_DRAMADDR_LO_MASK_ADDR       0xFFFF0000 /* address bits 39-24 */
3317
3318 #define OPT_DRAMADDR_LO_MASK_OFF        0xFFFFFF /* offset for address */
3319
3320 /*
3321  * Macros to derive addresses from Opteron DRAM Address Map registers
3322  */
3323 #define OPT_DRAMADDR_HI(reg) \
3324         (((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \
3325             OPT_DRAMADDR_HI_LSHIFT_ADDR)
3326
3327 #define OPT_DRAMADDR_LO(reg) \
3328         (((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \
3329             OPT_DRAMADDR_LO_LSHIFT_ADDR)
3330
3331 #define OPT_DRAMADDR(high, low) \
3332         (OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low))
3333
3334 /*
3335  * Bit masks defining what's in Opteron DRAM Address Map base register
3336  */
3337 #define OPT_DRAMBASE_LO_MASK_RE         0x1     /* read enable */
3338 #define OPT_DRAMBASE_LO_MASK_WE         0x2     /* write enable */
3339 #define OPT_DRAMBASE_LO_MASK_INTRLVEN   0x700   /* interleave */
3340
3341 /*
3342  * Bit masks defining what's in Opteron DRAM Address Map limit register
3343  */
3344 #define OPT_DRAMLIMIT_LO_MASK_DSTNODE   0x7             /* destination node */
3345 #define OPT_DRAMLIMIT_LO_MASK_INTRLVSEL 0x700           /* interleave select */
3346
3347
3348 /*
3349  * Opteron Node ID register in PCI configuration space contains
3350  * number of nodes in system, etc. for Opteron K8.  The following
3351  * constants and macros define its contents, structure, and access.
3352  */
3353
3354 /*
3355  * Bit masks defining what's in Opteron Node ID register
3356  */
3357 #define OPT_NODE_MASK_ID        0x7     /* node ID */
3358 #define OPT_NODE_MASK_CNT       0x70    /* node count */
3359 #define OPT_NODE_MASK_IONODE    0x700   /* Hypertransport I/O hub node ID */
3360 #define OPT_NODE_MASK_LCKNODE   0x7000  /* lock controller node ID */
3361 #define OPT_NODE_MASK_CPUCNT    0xF0000 /* CPUs in system (0 means 1 CPU)  */
3362
3363 /*
3364  * How many bits in Opteron Node ID register to shift right to get actual value
3365  */
3366 #define OPT_NODE_RSHIFT_CNT     0x4     /* shift right for node count value */
3367
3368 /*
3369  * Macros to get values from Opteron Node ID register
3370  */
3371 #define OPT_NODE_CNT(reg) \
3372         ((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)
3373
3374 /*
3375  * Macro to setup PCI Extended Configuration Space (ECS) address to give to
3376  * "in/out" instructions
3377  *
3378  * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any
3379  *       other uses should just do MMIO to access PCI ECS.
3380  *       Must enable special bit in Northbridge Configuration Register on
3381  *       Greyhound for extended CF8 space access to be able to access PCI ECS
3382  *       using "in/out" instructions and restore special bit after done
3383  *       accessing PCI ECS.
3384  */
3385 #define OPT_PCI_ECS_ADDR(bus, device, function, reg) \
3386         (PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11)  | \
3387             (((function) & 0x7) << 8) | ((reg) & 0xfc) | \
3388             ((((reg) >> 8) & 0xf) << 24))
3389
3390 /*
3391  * PCI configuration space registers accessed by specifying
3392  * a bus, device, function, and offset.  The following constants
3393  * define the values needed to access Opteron K8 configuration
3394  * info to determine its node topology
3395  */
3396
3397 #define OPT_PCS_BUS_CONFIG      0       /* Hypertransport config space bus */
3398
3399 /*
3400  * Opteron PCI configuration space register function values
3401  */
3402 #define OPT_PCS_FUNC_HT         0       /* Hypertransport configuration */
3403 #define OPT_PCS_FUNC_ADDRMAP    1       /* Address map configuration */
3404 #define OPT_PCS_FUNC_DRAM       2       /* DRAM configuration */
3405 #define OPT_PCS_FUNC_MISC       3       /* Miscellaneous configuration */
3406
3407 /*
3408  * PCI Configuration Space register offsets
3409  */
3410 #define OPT_PCS_OFF_VENDOR      0x0     /* device/vendor ID register */
3411 #define OPT_PCS_OFF_DRAMBASE_HI 0x140   /* DRAM Base register (node 0) */
3412 #define OPT_PCS_OFF_DRAMBASE_LO 0x40    /* DRAM Base register (node 0) */
3413 #define OPT_PCS_OFF_NODEID      0x60    /* Node ID register */
3414
3415 /*
3416  * Opteron PCI Configuration Space device IDs for nodes
3417  */
3418 #define OPT_PCS_DEV_NODE0               24      /* device number for node 0 */
3419
3420
3421 /*
3422  * Opteron DRAM address map gives base and limit for physical memory in a node
3423  */
3424 typedef struct opt_dram_addr_map {
3425         uint32_t        base_hi;
3426         uint32_t        base_lo;
3427         uint32_t        limit_hi;
3428         uint32_t        limit_lo;
3429 } opt_dram_addr_map_t;
3430
3431
3432 /*
3433  * Supported AMD processor families
3434  */
3435 #define AMD_FAMILY_HAMMER       15
3436 #define AMD_FAMILY_GREYHOUND    16
3437
3438 /*
3439  * Whether to have is_opteron() return 1 even when processor isn't supported
3440  */
3441 uint_t  is_opteron_override = 0;
3442
3443 /*
3444  * AMD processor family for current CPU
3445  */
3446 uint_t  opt_family = 0;
3447
3448
3449 /*
3450  * Determine whether we're running on a supported AMD Opteron since reading
3451  * node count and DRAM address map registers may have different format or
3452  * may not be supported across processor families
3453  */
3454 static int
3455 is_opteron(void)
3456 {
3457
3458         if (x86_vendor != X86_VENDOR_AMD)
3459                 return (0);
3460
3461         opt_family = cpuid_getfamily(CPU);
3462         if (opt_family == AMD_FAMILY_HAMMER ||
3463             opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override)
3464                 return (1);
3465         else
3466                 return (0);
3467 }
3468
3469
3470 /*
3471  * Determine NUMA configuration for Opteron from registers that live in PCI
3472  * configuration space
3473  */
3474 static void
3475 opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
3476     memnode_phys_addr_map_t *memnode_info)
3477 {
3478         uint_t                          bus;
3479         uint_t                          dev;
3480         struct opt_dram_addr_map        dram_map[MAX_NODES];
3481         uint_t                          node;
3482         uint_t                          node_info[MAX_NODES];
3483         uint_t                          off_hi;
3484         uint_t                          off_lo;
3485         uint64_t                        nb_cfg_reg;
3486
3487         /*
3488          * Read configuration registers from PCI configuration space to
3489          * determine node information, which memory is in each node, etc.
3490          *
3491          * Write to PCI configuration space address register to specify
3492          * which configuration register to read and read/write PCI
3493          * configuration space data register to get/set contents
3494          */
3495         bus = OPT_PCS_BUS_CONFIG;
3496         dev = OPT_PCS_DEV_NODE0;
3497         off_hi = OPT_PCS_OFF_DRAMBASE_HI;
3498         off_lo = OPT_PCS_OFF_DRAMBASE_LO;
3499
3500         /*
3501          * Read node ID register for node 0 to get node count
3502          */
3503         node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT,
3504             OPT_PCS_OFF_NODEID);
3505         *node_cnt = OPT_NODE_CNT(node_info[0]) + 1;
3506
3507         /*
3508          * If number of nodes is more than maximum supported, then set node
3509          * count to 1 and treat system as UMA instead of NUMA.
3510          */
3511         if (*node_cnt > MAX_NODES) {
3512                 *node_cnt = 1;
3513                 return;
3514         }
3515
3516         /*
3517          * For Greyhound, PCI Extended Configuration Space must be enabled to
3518          * read high DRAM address map base and limit registers
3519          */
3520         if (opt_family == AMD_FAMILY_GREYHOUND) {
3521                 nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG);
3522                 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
3523                         wrmsr(MSR_AMD_NB_CFG,
3524                             nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS);
3525         }
3526
3527         for (node = 0; node < *node_cnt; node++) {
3528                 uint32_t        base_hi;
3529                 uint32_t        base_lo;
3530                 uint32_t        limit_hi;
3531                 uint32_t        limit_lo;
3532
3533                 /*
3534                  * Read node ID register (except for node 0 which we just read)
3535                  */
3536                 if (node > 0) {
3537                         node_info[node] = pci_getl_func(bus, dev,
3538                             OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID);
3539                 }
3540
3541                 /*
3542                  * Read DRAM base and limit registers which specify
3543                  * physical memory range of each node
3544                  */
3545                 if (opt_family != AMD_FAMILY_GREYHOUND)
3546                         base_hi = 0;
3547                 else {
3548                         outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
3549                             OPT_PCS_FUNC_ADDRMAP, off_hi));
3550                         base_hi = dram_map[node].base_hi =
3551                             inl(PCI_CONFDATA);
3552                 }
3553                 base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev,
3554                     OPT_PCS_FUNC_ADDRMAP, off_lo);
3555
3556                 if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) &&
3557                     mem_intrlv)
3558                         *mem_intrlv = *mem_intrlv + 1;
3559
3560                 off_hi += 4;    /* high limit register offset */
3561                 if (opt_family != AMD_FAMILY_GREYHOUND)
3562                         limit_hi = 0;
3563                 else {
3564                         outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
3565                             OPT_PCS_FUNC_ADDRMAP, off_hi));
3566                         limit_hi = dram_map[node].limit_hi =
3567                             inl(PCI_CONFDATA);
3568                 }
3569
3570                 off_lo += 4;    /* low limit register offset */
3571                 limit_lo = dram_map[node].limit_lo = pci_getl_func(bus,
3572                     dev, OPT_PCS_FUNC_ADDRMAP, off_lo);
3573
3574                 /*
3575                  * Increment device number to next node and register offsets
3576                  * for DRAM base register of next node
3577                  */
3578                 off_hi += 4;
3579                 off_lo += 4;
3580                 dev++;
3581
3582                 /*
3583                  * Both read and write enable bits must be enabled in DRAM
3584                  * address map base register for physical memory to exist in
3585                  * node
3586                  */
3587                 if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 ||
3588                     (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) {
3589                         /*
3590                          * Mark node memory as non-existent and set start and
3591                          * end addresses to be same in memnode_info[]
3592                          */
3593                         memnode_info[node].exists = 0;
3594                         memnode_info[node].start = memnode_info[node].end =
3595                             (pfn_t)-1;
3596                         continue;
3597                 }
3598
3599                 /*
3600                  * Mark node memory as existing and remember physical address
3601                  * range of each node for use later
3602                  */
3603                 memnode_info[node].exists = 1;
3604
3605                 memnode_info[node].start = btop(OPT_DRAMADDR(base_hi, base_lo));
3606
3607                 memnode_info[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) |
3608                     OPT_DRAMADDR_LO_MASK_OFF);
3609         }
3610
3611         /*
3612          * Restore PCI Extended Configuration Space enable bit
3613          */
3614         if (opt_family == AMD_FAMILY_GREYHOUND) {
3615                 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
3616                         wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg);
3617         }
3618 }
3619
3620
3621 /*
3622  * Return average amount of time to read vendor ID register on Northbridge
3623  * N times on specified destination node from current CPU
3624  */
3625 static hrtime_t
3626 opt_probe_vendor(int dest_node, int nreads)
3627 {
3628         int             cnt;
3629         uint_t          dev;
3630         /* LINTED: set but not used in function */
3631         volatile uint_t dev_vendor;
3632         hrtime_t        elapsed;
3633         hrtime_t        end;
3634         int             ipl;
3635         hrtime_t        start;
3636
3637         dev = OPT_PCS_DEV_NODE0 + dest_node;
3638         kpreempt_disable();
3639         ipl = spl8();
3640         outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM,
3641             OPT_PCS_OFF_VENDOR));
3642         start = gethrtime();
3643         for (cnt = 0; cnt < nreads; cnt++)
3644                 dev_vendor = inl(PCI_CONFDATA);
3645         end = gethrtime();
3646         elapsed = (end - start) / nreads;
3647         splx(ipl);
3648         kpreempt_enable();
3649         return (elapsed);
3650 }