arch/powerpc/platforms/pseries/ras.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2001 Dave Engebretsen IBM Corporation
   4  */
   5
   6 #include <linux/sched.h>
   7 #include <linux/interrupt.h>
   8 #include <linux/irq.h>
   9 #include <linux/of.h>
  10 #include <linux/fs.h>
  11 #include <linux/reboot.h>
  12 #include <linux/irq_work.h>
  13
  14 #include <asm/machdep.h>
  15 #include <asm/rtas.h>
  16 #include <asm/firmware.h>
  17 #include <asm/mce.h>
  18
  19 #include "pseries.h"
  20
/*
 * Buffer handed (by physical address) to the "check-exception" RTAS call
 * by the interrupt handlers in this file. ras_log_buf_lock is held around
 * every use of the buffer by those handlers.
 */
static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
static DEFINE_SPINLOCK(ras_log_buf_lock);

/* RTAS token for "check-exception", looked up in init_ras_IRQ(). */
static int ras_check_exception_token;

/*
 * irq_work queued from pseries_machine_check_realmode() so the per-cpu
 * machine check error log is passed to log_error() later.
 */
static void mce_process_errlog_event(struct irq_work *work);
static struct irq_work mce_errlog_process_work = {
        .func = mce_process_errlog_event,
};

/* Sensor token and index passed to rtas_get_sensor_fast() for EPOW events. */
#define EPOW_SENSOR_TOKEN       9
#define EPOW_SENSOR_INDEX       0

/* EPOW events counter variable */
static int num_epow_events;

/* Interrupt handlers registered for the RAS event sources. */
static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id);
static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
  40
/*
 * RTAS pseries MCE errorlog section.
 *
 * The structure is packed and its multi-byte fields are big-endian
 * (__be32/__be64); convert them with be*_to_cpu() before use.
 */
struct pseries_mc_errorlog {
        __be32  fru_id;
        __be32  proc_id;
        u8      error_type;     /* one of the MC_ERROR_TYPE_* values */
        /*
         * sub_err_type (1 byte). Bit fields depend on error_type
         *
         *   MSB0
         *   |
         *   V
         *   01234567
         *   XXXXXXXX
         *
         * For error_type == MC_ERROR_TYPE_UE
         *   XXXXXXXX
         *   X          1: Permanent or Transient UE.
         *    X         1: Effective address provided.
         *     X        1: Logical address provided.
         *      XX      2: Reserved.
         *        XXX   3: Type of UE error.
         *
         * For error_type != MC_ERROR_TYPE_UE
         *   XXXXXXXX
         *   X          1: Effective address provided.
         *    XXXXX     5: Reserved.
         *         XX   2: Type of SLB/ERAT/TLB error.
         */
        u8      sub_err_type;
        u8      reserved_1[6];
        __be64  effective_address;
        __be64  logical_address;
} __packed;
  74
/* RTAS pseries MCE error types (values of pseries_mc_errorlog.error_type) */
#define MC_ERROR_TYPE_UE                0x00
#define MC_ERROR_TYPE_SLB               0x01
#define MC_ERROR_TYPE_ERAT              0x02
#define MC_ERROR_TYPE_UNKNOWN           0x03
#define MC_ERROR_TYPE_TLB               0x04
#define MC_ERROR_TYPE_D_CACHE           0x05
#define MC_ERROR_TYPE_I_CACHE           0x07

/*
 * RTAS pseries MCE error sub types, as returned by
 * rtas_mc_error_sub_type(). The UE values occupy the low three bits of
 * sub_err_type; the SLB/ERAT/TLB values occupy the low two bits.
 */
#define MC_ERROR_UE_INDETERMINATE               0
#define MC_ERROR_UE_IFETCH                      1
#define MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH      2
#define MC_ERROR_UE_LOAD_STORE                  3
#define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE  4

/* Flag bits of sub_err_type that are tested for MC_ERROR_TYPE_UE logs. */
#define UE_EFFECTIVE_ADDR_PROVIDED              0x40
#define UE_LOGICAL_ADDR_PROVIDED                0x20

/* Note: the SLB sub types start at 0, the ERAT and TLB ones start at 1. */
#define MC_ERROR_SLB_PARITY             0
#define MC_ERROR_SLB_MULTIHIT           1
#define MC_ERROR_SLB_INDETERMINATE      2

#define MC_ERROR_ERAT_PARITY            1
#define MC_ERROR_ERAT_MULTIHIT          2
#define MC_ERROR_ERAT_INDETERMINATE     3

#define MC_ERROR_TLB_PARITY             1
#define MC_ERROR_TLB_MULTIHIT           2
#define MC_ERROR_TLB_INDETERMINATE      3
 105
 106 static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog)
 107 {
 108         switch (mlog->error_type) {
 109         case    MC_ERROR_TYPE_UE:
 110                 return (mlog->sub_err_type & 0x07);
 111         case    MC_ERROR_TYPE_SLB:
 112         case    MC_ERROR_TYPE_ERAT:
 113         case    MC_ERROR_TYPE_TLB:
 114                 return (mlog->sub_err_type & 0x03);
 115         default:
 116                 return 0;
 117         }
 118 }
 119
 120 /*
 121  * Enable the hotplug interrupt late because processing them may touch other
 122  * devices or systems (e.g. hugepages) that have not been initialized at the
 123  * subsys stage.
 124  */
 125 int __init init_ras_hotplug_IRQ(void)
 126 {
 127         struct device_node *np;
 128
 129         /* Hotplug Events */
 130         np = of_find_node_by_path("/event-sources/hot-plug-events");
 131         if (np != NULL) {
 132                 if (dlpar_workqueue_init() == 0)
 133                         request_event_sources_irqs(np, ras_hotplug_interrupt,
 134                                                    "RAS_HOTPLUG");
 135                 of_node_put(np);
 136         }
 137
 138         return 0;
 139 }
 140 machine_late_initcall(pseries, init_ras_hotplug_IRQ);
 141
 142 /*
 143  * Initialize handlers for the set of interrupts caused by hardware errors
 144  * and power system events.
 145  */
 146 static int __init init_ras_IRQ(void)
 147 {
 148         struct device_node *np;
 149
 150         ras_check_exception_token = rtas_token("check-exception");
 151
 152         /* Internal Errors */
 153         np = of_find_node_by_path("/event-sources/internal-errors");
 154         if (np != NULL) {
 155                 request_event_sources_irqs(np, ras_error_interrupt,
 156                                            "RAS_ERROR");
 157                 of_node_put(np);
 158         }
 159
 160         /* EPOW Events */
 161         np = of_find_node_by_path("/event-sources/epow-events");
 162         if (np != NULL) {
 163                 request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW");
 164                 of_node_put(np);
 165         }
 166
 167         return 0;
 168 }
 169 machine_subsys_initcall(pseries, init_ras_IRQ);
 170
 171 #define EPOW_SHUTDOWN_NORMAL                            1
 172 #define EPOW_SHUTDOWN_ON_UPS                            2
 173 #define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS        3
 174 #define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH      4
 175
 176 static void handle_system_shutdown(char event_modifier)
 177 {
 178         switch (event_modifier) {
 179         case EPOW_SHUTDOWN_NORMAL:
 180                 pr_emerg("Power off requested\n");
 181                 orderly_poweroff(true);
 182                 break;
 183
 184         case EPOW_SHUTDOWN_ON_UPS:
 185                 pr_emerg("Loss of system power detected. System is running on"
 186                          " UPS/battery. Check RTAS error log for details\n");
 187                 break;
 188
 189         case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS:
 190                 pr_emerg("Loss of system critical functions detected. Check"
 191                          " RTAS error log for details\n");
 192                 orderly_poweroff(true);
 193                 break;
 194
 195         case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH:
 196                 pr_emerg("High ambient temperature detected. Check RTAS"
 197                          " error log for details\n");
 198                 orderly_poweroff(true);
 199                 break;
 200
 201         default:
 202                 pr_err("Unknown power/cooling shutdown event (modifier = %d)\n",
 203                         event_modifier);
 204         }
 205 }
 206
/*
 * Layout of the data carried by the EPOW section of an RTAS error log.
 * rtas_parse_epow_errlog() reads only the low four bits of sensor_value
 * (the action code) and of event_modifier; it does not look at the
 * remaining fields.
 */
struct epow_errorlog {
        unsigned char sensor_value;
        unsigned char event_modifier;
        unsigned char extended_modifier;
        unsigned char reserved;
        unsigned char platform_reason;
};
 214
 215 #define EPOW_RESET                      0
 216 #define EPOW_WARN_COOLING               1
 217 #define EPOW_WARN_POWER                 2
 218 #define EPOW_SYSTEM_SHUTDOWN            3
 219 #define EPOW_SYSTEM_HALT                4
 220 #define EPOW_MAIN_ENCLOSURE             5
 221 #define EPOW_POWER_OFF                  7
 222
 223 static void rtas_parse_epow_errlog(struct rtas_error_log *log)
 224 {
 225         struct pseries_errorlog *pseries_log;
 226         struct epow_errorlog *epow_log;
 227         char action_code;
 228         char modifier;
 229
 230         pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW);
 231         if (pseries_log == NULL)
 232                 return;
 233
 234         epow_log = (struct epow_errorlog *)pseries_log->data;
 235         action_code = epow_log->sensor_value & 0xF;     /* bottom 4 bits */
 236         modifier = epow_log->event_modifier & 0xF;      /* bottom 4 bits */
 237
 238         switch (action_code) {
 239         case EPOW_RESET:
 240                 if (num_epow_events) {
 241                         pr_info("Non critical power/cooling issue cleared\n");
 242                         num_epow_events--;
 243                 }
 244                 break;
 245
 246         case EPOW_WARN_COOLING:
 247                 pr_info("Non-critical cooling issue detected. Check RTAS error"
 248                         " log for details\n");
 249                 break;
 250
 251         case EPOW_WARN_POWER:
 252                 pr_info("Non-critical power issue detected. Check RTAS error"
 253                         " log for details\n");
 254                 break;
 255
 256         case EPOW_SYSTEM_SHUTDOWN:
 257                 handle_system_shutdown(modifier);
 258                 break;
 259
 260         case EPOW_SYSTEM_HALT:
 261                 pr_emerg("Critical power/cooling issue detected. Check RTAS"
 262                          " error log for details. Powering off.\n");
 263                 orderly_poweroff(true);
 264                 break;
 265
 266         case EPOW_MAIN_ENCLOSURE:
 267         case EPOW_POWER_OFF:
 268                 pr_emerg("System about to lose power. Check RTAS error log "
 269                          " for details. Powering off immediately.\n");
 270                 emergency_sync();
 271                 kernel_power_off();
 272                 break;
 273
 274         default:
 275                 pr_err("Unknown power/cooling event (action code  = %d)\n",
 276                         action_code);
 277         }
 278
 279         /* Increment epow events counter variable */
 280         if (action_code != EPOW_RESET)
 281                 num_epow_events++;
 282 }
 283
 284 static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id)
 285 {
 286         struct pseries_errorlog *pseries_log;
 287         struct pseries_hp_errorlog *hp_elog;
 288
 289         spin_lock(&ras_log_buf_lock);
 290
 291         rtas_call(ras_check_exception_token, 6, 1, NULL,
 292                   RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq),
 293                   RTAS_HOTPLUG_EVENTS, 0, __pa(&ras_log_buf),
 294                   rtas_get_error_log_max());
 295
 296         pseries_log = get_pseries_errorlog((struct rtas_error_log *)ras_log_buf,
 297                                            PSERIES_ELOG_SECT_ID_HOTPLUG);
 298         hp_elog = (struct pseries_hp_errorlog *)pseries_log->data;
 299
 300         /*
 301          * Since PCI hotplug is not currently supported on pseries, put PCI
 302          * hotplug events on the ras_log_buf to be handled by rtas_errd.
 303          */
 304         if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM ||
 305             hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU ||
 306             hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_PMEM)
 307                 queue_hotplug_event(hp_elog);
 308         else
 309                 log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
 310
 311         spin_unlock(&ras_log_buf_lock);
 312         return IRQ_HANDLED;
 313 }
 314
 315 /* Handle environmental and power warning (EPOW) interrupts. */
 316 static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
 317 {
 318         int status;
 319         int state;
 320         int critical;
 321
 322         status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX,
 323                                       &state);
 324
 325         if (state > 3)
 326                 critical = 1;           /* Time Critical */
 327         else
 328                 critical = 0;
 329
 330         spin_lock(&ras_log_buf_lock);
 331
 332         status = rtas_call(ras_check_exception_token, 6, 1, NULL,
 333                            RTAS_VECTOR_EXTERNAL_INTERRUPT,
 334                            virq_to_hw(irq),
 335                            RTAS_EPOW_WARNING,
 336                            critical, __pa(&ras_log_buf),
 337                                 rtas_get_error_log_max());
 338
 339         log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
 340
 341         rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf);
 342
 343         spin_unlock(&ras_log_buf_lock);
 344         return IRQ_HANDLED;
 345 }
 346
 347 /*
 348  * Handle hardware error interrupts.
 349  *
 350  * RTAS check-exception is called to collect data on the exception.  If
 351  * the error is deemed recoverable, we log a warning and return.
 352  * For nonrecoverable errors, an error is logged and we stop all processing
 353  * as quickly as possible in order to prevent propagation of the failure.
 354  */
 355 static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
 356 {
 357         struct rtas_error_log *rtas_elog;
 358         int status;
 359         int fatal;
 360
 361         spin_lock(&ras_log_buf_lock);
 362
 363         status = rtas_call(ras_check_exception_token, 6, 1, NULL,
 364                            RTAS_VECTOR_EXTERNAL_INTERRUPT,
 365                            virq_to_hw(irq),
 366                            RTAS_INTERNAL_ERROR, 1 /* Time Critical */,
 367                            __pa(&ras_log_buf),
 368                                 rtas_get_error_log_max());
 369
 370         rtas_elog = (struct rtas_error_log *)ras_log_buf;
 371
 372         if (status == 0 &&
 373             rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC)
 374                 fatal = 1;
 375         else
 376                 fatal = 0;
 377
 378         /* format and print the extended information */
 379         log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal);
 380
 381         if (fatal) {
 382                 pr_emerg("Fatal hardware error detected. Check RTAS error"
 383                          " log for details. Powering off immediately\n");
 384                 emergency_sync();
 385                 kernel_power_off();
 386         } else {
 387                 pr_err("Recoverable hardware error detected\n");
 388         }
 389
 390         spin_unlock(&ras_log_buf_lock);
 391         return IRQ_HANDLED;
 392 }
 393
/*
 * Some versions of FWNMI place the buffer inside the 4kB page starting at
 * 0x7000. Other versions place it inside the rtas buffer. We check both.
 * Minimum size of the buffer is 16 bytes.
 *
 * Evaluates to true when address A lies in either range with at least 16
 * bytes remaining before the end of that range. Note that A is evaluated
 * several times, so it must not have side effects.
 */
#define VALID_FWNMI_BUFFER(A) \
        ((((A) >= 0x7000) && ((A) <= 0x8000 - 16)) || \
        (((A) >= rtas.base) && ((A) <= (rtas.base + rtas.size - 16))))
 402
 403 static inline struct rtas_error_log *fwnmi_get_errlog(void)
 404 {
 405         return (struct rtas_error_log *)local_paca->mce_data_buf;
 406 }
 407
 408 static __be64 *fwnmi_get_savep(struct pt_regs *regs)
 409 {
 410         unsigned long savep_ra;
 411
 412         /* Mask top two bits */
 413         savep_ra = regs->gpr[3] & ~(0x3UL << 62);
 414         if (!VALID_FWNMI_BUFFER(savep_ra)) {
 415                 printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
 416                 return NULL;
 417         }
 418
 419         return __va(savep_ra);
 420 }
 421
 422 /*
 423  * Get the error information for errors coming through the
 424  * FWNMI vectors.  The pt_regs' r3 will be updated to reflect
 425  * the actual r3 if possible, and a ptr to the error log entry
 426  * will be returned if found.
 427  *
 428  * Use one buffer mce_data_buf per cpu to store RTAS error.
 429  *
 430  * The mce_data_buf does not have any locks or protection around it,
 431  * if a second machine check comes in, or a system reset is done
 432  * before we have logged the error, then we will get corruption in the
 433  * error log.  This is preferable over holding off on calling
 434  * ibm,nmi-interlock which would result in us checkstopping if a
 435  * second machine check did come in.
 436  */
 437 static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
 438 {
 439         struct rtas_error_log *h;
 440         __be64 *savep;
 441
 442         savep = fwnmi_get_savep(regs);
 443         if (!savep)
 444                 return NULL;
 445
 446         regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */
 447
 448         h = (struct rtas_error_log *)&savep[1];
 449         /* Use the per cpu buffer from paca to store rtas error log */
 450         memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
 451         if (!rtas_error_extended(h)) {
 452                 memcpy(local_paca->mce_data_buf, h, sizeof(__u64));
 453         } else {
 454                 int len, error_log_length;
 455
 456                 error_log_length = 8 + rtas_error_extended_log_length(h);
 457                 len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX);
 458                 memcpy(local_paca->mce_data_buf, h, len);
 459         }
 460
 461         return (struct rtas_error_log *)local_paca->mce_data_buf;
 462 }
 463
 464 /* Call this when done with the data returned by FWNMI_get_errinfo.
 465  * It will release the saved data area for other CPUs in the
 466  * partition to receive FWNMI errors.
 467  */
 468 static void fwnmi_release_errinfo(void)
 469 {
 470         struct rtas_args rtas_args;
 471         int ret;
 472
 473         /*
 474          * On pseries, the machine check stack is limited to under 4GB, so
 475          * args can be on-stack.
 476          */
 477         rtas_call_unlocked(&rtas_args, ibm_nmi_interlock_token, 0, 1, NULL);
 478         ret = be32_to_cpu(rtas_args.rets[0]);
 479         if (ret != 0)
 480                 printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret);
 481 }
 482
 483 int pSeries_system_reset_exception(struct pt_regs *regs)
 484 {
 485 #ifdef __LITTLE_ENDIAN__
 486         /*
 487          * Some firmware byteswaps SRR registers and gives incorrect SRR1. Try
 488          * to detect the bad SRR1 pattern here. Flip the NIP back to correct
 489          * endian for reporting purposes. Unfortunately the MSR can't be fixed,
 490          * so clear it. It will be missing MSR_RI so we won't try to recover.
 491          */
 492         if ((be64_to_cpu(regs->msr) &
 493                         (MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR|
 494                          MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) {
 495                 regs->nip = be64_to_cpu((__be64)regs->nip);
 496                 regs->msr = 0;
 497         }
 498 #endif
 499
 500         if (fwnmi_active) {
 501                 __be64 *savep;
 502
 503                 /*
 504                  * Firmware (PowerVM and KVM) saves r3 to a save area like
 505                  * machine check, which is not exactly what PAPR (2.9)
 506                  * suggests but there is no way to detect otherwise, so this
 507                  * is the interface now.
 508                  *
 509                  * System resets do not save any error log or require an
 510                  * "ibm,nmi-interlock" rtas call to release.
 511                  */
 512
 513                 savep = fwnmi_get_savep(regs);
 514                 if (savep)
 515                         regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */
 516         }
 517
 518         if (smp_handle_nmi_ipi(regs))
 519                 return 1;
 520
 521         return 0; /* need to perform reset */
 522 }
 523
 524 static int mce_handle_err_realmode(int disposition, u8 error_type)
 525 {
 526 #ifdef CONFIG_PPC_BOOK3S_64
 527         if (disposition == RTAS_DISP_NOT_RECOVERED) {
 528                 switch (error_type) {
 529                 case    MC_ERROR_TYPE_ERAT:
 530                         flush_erat();
 531                         disposition = RTAS_DISP_FULLY_RECOVERED;
 532                         break;
 533                 case    MC_ERROR_TYPE_SLB:
 534                         /*
 535                          * Store the old slb content in paca before flushing.
 536                          * Print this when we go to virtual mode.
 537                          * There are chances that we may hit MCE again if there
 538                          * is a parity error on the SLB entry we trying to read
 539                          * for saving. Hence limit the slb saving to single
 540                          * level of recursion.
 541                          */
 542                         if (local_paca->in_mce == 1)
 543                                 slb_save_contents(local_paca->mce_faulty_slbs);
 544                         flush_and_reload_slb();
 545                         disposition = RTAS_DISP_FULLY_RECOVERED;
 546                         break;
 547                 default:
 548                         break;
 549                 }
 550         } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
 551                 /* Platform corrected itself but could be degraded */
 552                 pr_err("MCE: limited recovery, system may be degraded\n");
 553                 disposition = RTAS_DISP_FULLY_RECOVERED;
 554         }
 555 #endif
 556         return disposition;
 557 }
 558
 559 static int mce_handle_err_virtmode(struct pt_regs *regs,
 560                                    struct rtas_error_log *errp,
 561                                    struct pseries_mc_errorlog *mce_log,
 562                                    int disposition)
 563 {
 564         struct mce_error_info mce_err = { 0 };
 565         int initiator = rtas_error_initiator(errp);
 566         int severity = rtas_error_severity(errp);
 567         unsigned long eaddr = 0, paddr = 0;
 568         u8 error_type, err_sub_type;
 569
 570         if (!mce_log)
 571                 goto out;
 572
 573         error_type = mce_log->error_type;
 574         err_sub_type = rtas_mc_error_sub_type(mce_log);
 575
 576         if (initiator == RTAS_INITIATOR_UNKNOWN)
 577                 mce_err.initiator = MCE_INITIATOR_UNKNOWN;
 578         else if (initiator == RTAS_INITIATOR_CPU)
 579                 mce_err.initiator = MCE_INITIATOR_CPU;
 580         else if (initiator == RTAS_INITIATOR_PCI)
 581                 mce_err.initiator = MCE_INITIATOR_PCI;
 582         else if (initiator == RTAS_INITIATOR_ISA)
 583                 mce_err.initiator = MCE_INITIATOR_ISA;
 584         else if (initiator == RTAS_INITIATOR_MEMORY)
 585                 mce_err.initiator = MCE_INITIATOR_MEMORY;
 586         else if (initiator == RTAS_INITIATOR_POWERMGM)
 587                 mce_err.initiator = MCE_INITIATOR_POWERMGM;
 588         else
 589                 mce_err.initiator = MCE_INITIATOR_UNKNOWN;
 590
 591         if (severity == RTAS_SEVERITY_NO_ERROR)
 592                 mce_err.severity = MCE_SEV_NO_ERROR;
 593         else if (severity == RTAS_SEVERITY_EVENT)
 594                 mce_err.severity = MCE_SEV_WARNING;
 595         else if (severity == RTAS_SEVERITY_WARNING)
 596                 mce_err.severity = MCE_SEV_WARNING;
 597         else if (severity == RTAS_SEVERITY_ERROR_SYNC)
 598                 mce_err.severity = MCE_SEV_SEVERE;
 599         else if (severity == RTAS_SEVERITY_ERROR)
 600                 mce_err.severity = MCE_SEV_SEVERE;
 601         else if (severity == RTAS_SEVERITY_FATAL)
 602                 mce_err.severity = MCE_SEV_FATAL;
 603         else
 604                 mce_err.severity = MCE_SEV_FATAL;
 605
 606         if (severity <= RTAS_SEVERITY_ERROR_SYNC)
 607                 mce_err.sync_error = true;
 608         else
 609                 mce_err.sync_error = false;
 610
 611         mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
 612         mce_err.error_class = MCE_ECLASS_UNKNOWN;
 613
 614         switch (error_type) {
 615         case MC_ERROR_TYPE_UE:
 616                 mce_err.error_type = MCE_ERROR_TYPE_UE;
 617                 mce_common_process_ue(regs, &mce_err);
 618                 if (mce_err.ignore_event)
 619                         disposition = RTAS_DISP_FULLY_RECOVERED;
 620                 switch (err_sub_type) {
 621                 case MC_ERROR_UE_IFETCH:
 622                         mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH;
 623                         break;
 624                 case MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH:
 625                         mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
 626                         break;
 627                 case MC_ERROR_UE_LOAD_STORE:
 628                         mce_err.u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
 629                         break;
 630                 case MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE:
 631                         mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
 632                         break;
 633                 case MC_ERROR_UE_INDETERMINATE:
 634                 default:
 635                         mce_err.u.ue_error_type = MCE_UE_ERROR_INDETERMINATE;
 636                         break;
 637                 }
 638                 if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED)
 639                         eaddr = be64_to_cpu(mce_log->effective_address);
 640
 641                 if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) {
 642                         paddr = be64_to_cpu(mce_log->logical_address);
 643                 } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
 644                         unsigned long pfn;
 645
 646                         pfn = addr_to_pfn(regs, eaddr);
 647                         if (pfn != ULONG_MAX)
 648                                 paddr = pfn << PAGE_SHIFT;
 649                 }
 650
 651                 break;
 652         case MC_ERROR_TYPE_SLB:
 653                 mce_err.error_type = MCE_ERROR_TYPE_SLB;
 654                 switch (err_sub_type) {
 655                 case MC_ERROR_SLB_PARITY:
 656                         mce_err.u.slb_error_type = MCE_SLB_ERROR_PARITY;
 657                         break;
 658                 case MC_ERROR_SLB_MULTIHIT:
 659                         mce_err.u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
 660                         break;
 661                 case MC_ERROR_SLB_INDETERMINATE:
 662                 default:
 663                         mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
 664                         break;
 665                 }
 666                 if (mce_log->sub_err_type & 0x80)
 667                         eaddr = be64_to_cpu(mce_log->effective_address);
 668                 break;
 669         case MC_ERROR_TYPE_ERAT:
 670                 mce_err.error_type = MCE_ERROR_TYPE_ERAT;
 671                 switch (err_sub_type) {
 672                 case MC_ERROR_ERAT_PARITY:
 673                         mce_err.u.erat_error_type = MCE_ERAT_ERROR_PARITY;
 674                         break;
 675                 case MC_ERROR_ERAT_MULTIHIT:
 676                         mce_err.u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
 677                         break;
 678                 case MC_ERROR_ERAT_INDETERMINATE:
 679                 default:
 680                         mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE;
 681                         break;
 682                 }
 683                 if (mce_log->sub_err_type & 0x80)
 684                         eaddr = be64_to_cpu(mce_log->effective_address);
 685                 break;
 686         case MC_ERROR_TYPE_TLB:
 687                 mce_err.error_type = MCE_ERROR_TYPE_TLB;
 688                 switch (err_sub_type) {
 689                 case MC_ERROR_TLB_PARITY:
 690                         mce_err.u.tlb_error_type = MCE_TLB_ERROR_PARITY;
 691                         break;
 692                 case MC_ERROR_TLB_MULTIHIT:
 693                         mce_err.u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
 694                         break;
 695                 case MC_ERROR_TLB_INDETERMINATE:
 696                 default:
 697                         mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE;
 698                         break;
 699                 }
 700                 if (mce_log->sub_err_type & 0x80)
 701                         eaddr = be64_to_cpu(mce_log->effective_address);
 702                 break;
 703         case MC_ERROR_TYPE_D_CACHE:
 704                 mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
 705                 break;
 706         case MC_ERROR_TYPE_I_CACHE:
 707                 mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
 708                 break;
 709         case MC_ERROR_TYPE_UNKNOWN:
 710         default:
 711                 mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
 712                 break;
 713         }
 714 out:
 715         save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
 716                        &mce_err, regs->nip, eaddr, paddr);
 717         return disposition;
 718 }
 719
 720 static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 721 {
 722         struct pseries_errorlog *pseries_log;
 723         struct pseries_mc_errorlog *mce_log = NULL;
 724         int disposition = rtas_error_disposition(errp);
 725         u8 error_type;
 726
 727         if (!rtas_error_extended(errp))
 728                 goto out;
 729
 730         pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
 731         if (!pseries_log)
 732                 goto out;
 733
 734         mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
 735         error_type = mce_log->error_type;
 736
 737         disposition = mce_handle_err_realmode(disposition, error_type);
 738
 739         /*
 740          * Enable translation as we will be accessing per-cpu variables
 741          * in save_mce_event() which may fall outside RMO region, also
 742          * leave it enabled because subsequently we will be queuing work
 743          * to workqueues where again per-cpu variables accessed, besides
 744          * fwnmi_release_errinfo() crashes when called in realmode on
 745          * pseries.
 746          * Note: All the realmode handling like flushing SLB entries for
 747          *       SLB multihit is done by now.
 748          */
 749 out:
 750         mtmsr(mfmsr() | MSR_IR | MSR_DR);
 751         disposition = mce_handle_err_virtmode(regs, errp, mce_log,
 752                                               disposition);
 753         return disposition;
 754 }
 755
 756 /*
 757  * Process MCE rtas errlog event.
 758  */
 759 static void mce_process_errlog_event(struct irq_work *work)
 760 {
 761         struct rtas_error_log *err;
 762
 763         err = fwnmi_get_errlog();
 764         log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
 765 }
 766
 767 /*
 768  * See if we can recover from a machine check exception.
 769  * This is only called on power4 (or above) and only via
 770  * the Firmware Non-Maskable Interrupts (fwnmi) handler
 771  * which provides the error analysis for us.
 772  *
 773  * Return 1 if corrected (or delivered a signal).
 774  * Return 0 if there is nothing we can do.
 775  */
 776 static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt)
 777 {
 778         int recovered = 0;
 779
 780         if (!(regs->msr & MSR_RI)) {
 781                 /* If MSR_RI isn't set, we cannot recover */
 782                 pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
 783                 recovered = 0;
 784         } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
 785                 /* Platform corrected itself */
 786                 recovered = 1;
 787         } else if (evt->severity == MCE_SEV_FATAL) {
 788                 /* Fatal machine check */
 789                 pr_err("Machine check interrupt is fatal\n");
 790                 recovered = 0;
 791         }
 792
 793         if (!recovered && evt->sync_error) {
 794                 /*
 795                  * Try to kill processes if we get a synchronous machine check
 796                  * (e.g., one caused by execution of this instruction). This
 797                  * will devolve into a panic if we try to kill init or are in
 798                  * an interrupt etc.
 799                  *
 800                  * TODO: Queue up this address for hwpoisioning later.
 801                  * TODO: This is not quite right for d-side machine
 802                  *       checks ->nip is not necessarily the important
 803                  *       address.
 804                  */
 805                 if ((user_mode(regs))) {
 806                         _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
 807                         recovered = 1;
 808                 } else if (die_will_crash()) {
 809                         /*
 810                          * die() would kill the kernel, so better to go via
 811                          * the platform reboot code that will log the
 812                          * machine check.
 813                          */
 814                         recovered = 0;
 815                 } else {
 816                         die("Machine check", regs, SIGBUS);
 817                         recovered = 1;
 818                 }
 819         }
 820
 821         return recovered;
 822 }
 823
 824 /*
 825  * Handle a machine check.
 826  *
 827  * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi)
 828  * should be present.  If so the handler which called us tells us if the
 829  * error was recovered (never true if RI=0).
 830  *
 831  * On hardware prior to Power 4 these exceptions were asynchronous which
 832  * means we can't tell exactly where it occurred and so we can't recover.
 833  */
 834 int pSeries_machine_check_exception(struct pt_regs *regs)
 835 {
 836         struct machine_check_event evt;
 837
 838         if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
 839                 return 0;
 840
 841         /* Print things out */
 842         if (evt.version != MCE_V1) {
 843                 pr_err("Machine Check Exception, Unknown event version %d !\n",
 844                        evt.version);
 845                 return 0;
 846         }
 847         machine_check_print_event_info(&evt, user_mode(regs), false);
 848
 849         if (recover_mce(regs, &evt))
 850                 return 1;
 851
 852         return 0;
 853 }
 854
 855 long pseries_machine_check_realmode(struct pt_regs *regs)
 856 {
 857         struct rtas_error_log *errp;
 858         int disposition;
 859
 860         if (fwnmi_active) {
 861                 errp = fwnmi_get_errinfo(regs);
 862                 /*
 863                  * Call to fwnmi_release_errinfo() in real mode causes kernel
 864                  * to panic. Hence we will call it as soon as we go into
 865                  * virtual mode.
 866                  */
 867                 disposition = mce_handle_error(regs, errp);
 868                 fwnmi_release_errinfo();
 869
 870                 /* Queue irq work to log this rtas event later. */
 871                 irq_work_queue(&mce_errlog_process_work);
 872
 873                 if (disposition == RTAS_DISP_FULLY_RECOVERED)
 874                         return 1;
 875         }
 876
 877         return 0;
 878 }