arch/powerpc/platforms/powernv/opal-hmi.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
   4  *
   5  * Copyright 2014 IBM Corporation
   6  * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
   7  */
   8
   9 #undef DEBUG
  10
  11 #include <linux/kernel.h>
  12 #include <linux/init.h>
  13 #include <linux/of.h>
  14 #include <linux/mm.h>
  15 #include <linux/slab.h>
  16
  17 #include <asm/opal.h>
  18 #include <asm/cputable.h>
  19 #include <asm/machdep.h>
  20
  21 #include "powernv.h"
  22
  23 static int opal_hmi_handler_nb_init;
  24 struct OpalHmiEvtNode {
  25         struct list_head list;
  26         struct OpalHMIEvent hmi_evt;
  27 };
  28
  29 struct xstop_reason {
  30         uint32_t xstop_reason;
  31         const char *unit_failed;
  32         const char *description;
  33 };
  34
  35 static LIST_HEAD(opal_hmi_evt_list);
  36 static DEFINE_SPINLOCK(opal_hmi_evt_lock);
  37
  38 static void print_core_checkstop_reason(const char *level,
  39                                         struct OpalHMIEvent *hmi_evt)
  40 {
  41         int i;
  42         static const struct xstop_reason xstop_reason[] = {
  43                 { CORE_CHECKSTOP_IFU_REGFILE, "IFU",
  44                                 "RegFile core check stop" },
  45                 { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
  46                 { CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
  47                                 "Core checkstop during recovery" },
  48                 { CORE_CHECKSTOP_ISU_REGFILE, "ISU",
  49                                 "RegFile core check stop (mapper error)" },
  50                 { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
  51                 { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
  52                 { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
  53                 { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
  54                                 "Recovery in maintenance mode" },
  55                 { CORE_CHECKSTOP_LSU_REGFILE, "LSU",
  56                                 "RegFile core check stop" },
  57                 { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
  58                                 "Forward Progress Error" },
  59                 { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
  60                 { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
  61                 { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
  62                                 "Hypervisor Resource error - core check stop" },
  63                 { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
  64                                 "Hang Recovery Failed (core check stop)" },
  65                 { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
  66                                 "Ambiguous Hang Detected (unknown source)" },
  67                 { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
  68                                 "Debug Trigger Error inject" },
  69                 { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
  70                                 "Hypervisor check stop via SPRC/SPRD" },
  71         };
  72
  73         /* Validity check */
  74         if (!hmi_evt->u.xstop_error.xstop_reason) {
  75                 printk("%s      Unknown Core check stop.\n", level);
  76                 return;
  77         }
  78
  79         printk("%s      CPU PIR: %08x\n", level,
  80                         be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
  81         for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
  82                 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
  83                                         xstop_reason[i].xstop_reason)
  84                         printk("%s      [Unit: %-3s] %s\n", level,
  85                                         xstop_reason[i].unit_failed,
  86                                         xstop_reason[i].description);
  87 }
  88
  89 static void print_nx_checkstop_reason(const char *level,
  90                                         struct OpalHMIEvent *hmi_evt)
  91 {
  92         int i;
  93         static const struct xstop_reason xstop_reason[] = {
  94                 { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
  95                                         "SHM invalid state error" },
  96                 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
  97                                         "DMA invalid state error bit 15" },
  98                 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
  99                                         "DMA invalid state error bit 16" },
 100                 { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
 101                                         "Channel 0 invalid state error" },
 102                 { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
 103                                         "Channel 1 invalid state error" },
 104                 { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
 105                                         "Channel 2 invalid state error" },
 106                 { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
 107                                         "Channel 3 invalid state error" },
 108                 { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
 109                                         "Channel 4 invalid state error" },
 110                 { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
 111                                         "Channel 5 invalid state error" },
 112                 { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
 113                                         "Channel 6 invalid state error" },
 114                 { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
 115                                         "Channel 7 invalid state error" },
 116                 { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
 117                                         "UE error on CRB(CSB address, CCB)" },
 118                 { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
 119                                         "SUE error on CRB(CSB address, CCB)" },
 120                 { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
 121                 "CRB Kill ISN received while holding ISN with UE error" },
 122         };
 123
 124         /* Validity check */
 125         if (!hmi_evt->u.xstop_error.xstop_reason) {
 126                 printk("%s      Unknown NX check stop.\n", level);
 127                 return;
 128         }
 129
 130         printk("%s      NX checkstop on CHIP ID: %x\n", level,
 131                         be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
 132         for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
 133                 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
 134                                         xstop_reason[i].xstop_reason)
 135                         printk("%s      [Unit: %-3s] %s\n", level,
 136                                         xstop_reason[i].unit_failed,
 137                                         xstop_reason[i].description);
 138 }
 139
 140 static void print_npu_checkstop_reason(const char *level,
 141                                         struct OpalHMIEvent *hmi_evt)
 142 {
 143         uint8_t reason, reason_count, i;
 144
 145         /*
 146          * We may not have a checkstop reason on some combination of
 147          * hardware and/or skiboot version
 148          */
 149         if (!hmi_evt->u.xstop_error.xstop_reason) {
 150                 printk("%s      NPU checkstop on chip %x\n", level,
 151                         be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
 152                 return;
 153         }
 154
 155         /*
 156          * NPU2 has 3 FIRs. Reason encoded on a byte as:
 157          *   2 bits for the FIR number
 158          *   6 bits for the bit number
 159          * It may be possible to find several reasons.
 160          *
 161          * We don't display a specific message per FIR bit as there
 162          * are too many and most are meaningless without the workbook
 163          * and/or hw team help anyway.
 164          */
 165         reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
 166                 sizeof(reason);
 167         for (i = 0; i < reason_count; i++) {
 168                 reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;
 169                 if (reason)
 170                         printk("%s      NPU checkstop on chip %x: FIR%d bit %d is set\n",
 171                                 level,
 172                                 be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
 173                                 reason >> 6, reason & 0x3F);
 174         }
 175 }
 176
 177 static void print_checkstop_reason(const char *level,
 178                                         struct OpalHMIEvent *hmi_evt)
 179 {
 180         uint8_t type = hmi_evt->u.xstop_error.xstop_type;
 181         switch (type) {
 182         case CHECKSTOP_TYPE_CORE:
 183                 print_core_checkstop_reason(level, hmi_evt);
 184                 break;
 185         case CHECKSTOP_TYPE_NX:
 186                 print_nx_checkstop_reason(level, hmi_evt);
 187                 break;
 188         case CHECKSTOP_TYPE_NPU:
 189                 print_npu_checkstop_reason(level, hmi_evt);
 190                 break;
 191         default:
 192                 printk("%s      Unknown Malfunction Alert of type %d\n",
 193                        level, type);
 194                 break;
 195         }
 196 }
 197
 198 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
 199 {
 200         const char *level, *sevstr, *error_info;
 201         static const char *hmi_error_types[] = {
 202                 "Malfunction Alert",
 203                 "Processor Recovery done",
 204                 "Processor recovery occurred again",
 205                 "Processor recovery occurred for masked error",
 206                 "Timer facility experienced an error",
 207                 "TFMR SPR is corrupted",
 208                 "UPS (Uninterrupted Power System) Overflow indication",
 209                 "An XSCOM operation failure",
 210                 "An XSCOM operation completed",
 211                 "SCOM has set a reserved FIR bit to cause recovery",
 212                 "Debug trigger has set a reserved FIR bit to cause recovery",
 213                 "A hypervisor resource error occurred",
 214                 "CAPP recovery process is in progress",
 215         };
 216         static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
 217                                       DEFAULT_RATELIMIT_BURST);
 218
 219         /* Print things out */
 220         if (hmi_evt->version < OpalHMIEvt_V1) {
 221                 pr_err("HMI Interrupt, Unknown event version %d !\n",
 222                         hmi_evt->version);
 223                 return;
 224         }
 225         switch (hmi_evt->severity) {
 226         case OpalHMI_SEV_NO_ERROR:
 227                 level = KERN_INFO;
 228                 sevstr = "Harmless";
 229                 break;
 230         case OpalHMI_SEV_WARNING:
 231                 level = KERN_WARNING;
 232                 sevstr = "";
 233                 break;
 234         case OpalHMI_SEV_ERROR_SYNC:
 235                 level = KERN_ERR;
 236                 sevstr = "Severe";
 237                 break;
 238         case OpalHMI_SEV_FATAL:
 239         default:
 240                 level = KERN_ERR;
 241                 sevstr = "Fatal";
 242                 break;
 243         }
 244
 245         if (hmi_evt->severity != OpalHMI_SEV_NO_ERROR || __ratelimit(&rs)) {
 246                 printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
 247                         level, sevstr,
 248                         hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
 249                         "Recovered" : "Not recovered");
 250                 error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
 251                                 hmi_error_types[hmi_evt->type]
 252                                 : "Unknown";
 253                 printk("%s Error detail: %s\n", level, error_info);
 254                 printk("%s      HMER: %016llx\n", level,
 255                                         be64_to_cpu(hmi_evt->hmer));
 256                 if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
 257                         (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
 258                         printk("%s      TFMR: %016llx\n", level,
 259                                                 be64_to_cpu(hmi_evt->tfmr));
 260         }
 261
 262         if (hmi_evt->version < OpalHMIEvt_V2)
 263                 return;
 264
 265         /* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
 266         if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
 267                 print_checkstop_reason(level, hmi_evt);
 268 }
 269
 270 static void hmi_event_handler(struct work_struct *work)
 271 {
 272         unsigned long flags;
 273         struct OpalHMIEvent *hmi_evt;
 274         struct OpalHmiEvtNode *msg_node;
 275         uint8_t disposition;
 276         struct opal_msg msg;
 277         int unrecoverable = 0;
 278
 279         spin_lock_irqsave(&opal_hmi_evt_lock, flags);
 280         while (!list_empty(&opal_hmi_evt_list)) {
 281                 msg_node = list_entry(opal_hmi_evt_list.next,
 282                                            struct OpalHmiEvtNode, list);
 283                 list_del(&msg_node->list);
 284                 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
 285
 286                 hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
 287                 print_hmi_event_info(hmi_evt);
 288                 disposition = hmi_evt->disposition;
 289                 kfree(msg_node);
 290
 291                 /*
 292                  * Check if HMI event has been recovered or not. If not
 293                  * then kernel can't continue, we need to panic.
 294                  * But before we do that, display all the HMI event
 295                  * available on the list and set unrecoverable flag to 1.
 296                  */
 297                 if (disposition != OpalHMI_DISPOSITION_RECOVERED)
 298                         unrecoverable = 1;
 299
 300                 spin_lock_irqsave(&opal_hmi_evt_lock, flags);
 301         }
 302         spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
 303
 304         if (unrecoverable) {
 305                 /* Pull all HMI events from OPAL before we panic. */
 306                 while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
 307                         u32 type;
 308
 309                         type = be32_to_cpu(msg.msg_type);
 310
 311                         /* skip if not HMI event */
 312                         if (type != OPAL_MSG_HMI_EVT)
 313                                 continue;
 314
 315                         /* HMI event info starts from param[0] */
 316                         hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
 317                         print_hmi_event_info(hmi_evt);
 318                 }
 319
 320                 pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
 321         }
 322 }
 323
 324 static DECLARE_WORK(hmi_event_work, hmi_event_handler);
 325 /*
 326  * opal_handle_hmi_event - notifier handler that queues up HMI events
 327  * to be preocessed later.
 328  */
 329 static int opal_handle_hmi_event(struct notifier_block *nb,
 330                           unsigned long msg_type, void *msg)
 331 {
 332         unsigned long flags;
 333         struct OpalHMIEvent *hmi_evt;
 334         struct opal_msg *hmi_msg = msg;
 335         struct OpalHmiEvtNode *msg_node;
 336
 337         /* Sanity Checks */
 338         if (msg_type != OPAL_MSG_HMI_EVT)
 339                 return 0;
 340
 341         /* HMI event info starts from param[0] */
 342         hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
 343
 344         /* Delay the logging of HMI events to workqueue. */
 345         msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
 346         if (!msg_node) {
 347                 pr_err("HMI: out of memory, Opal message event not handled\n");
 348                 return -ENOMEM;
 349         }
 350         memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));
 351
 352         spin_lock_irqsave(&opal_hmi_evt_lock, flags);
 353         list_add(&msg_node->list, &opal_hmi_evt_list);
 354         spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
 355
 356         schedule_work(&hmi_event_work);
 357         return 0;
 358 }
 359
 360 static struct notifier_block opal_hmi_handler_nb = {
 361         .notifier_call  = opal_handle_hmi_event,
 362         .next           = NULL,
 363         .priority       = 0,
 364 };
 365
 366 int __init opal_hmi_handler_init(void)
 367 {
 368         int ret;
 369
 370         if (!opal_hmi_handler_nb_init) {
 371                 ret = opal_message_notifier_register(
 372                                 OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
 373                 if (ret) {
 374                         pr_err("%s: Can't register OPAL event notifier (%d)\n",
 375                                __func__, ret);
 376                         return ret;
 377                 }
 378                 opal_hmi_handler_nb_init = 1;
 379         }
 380         return 0;
 381 }