/* Copyright 2013-2014 IBM Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <skiboot.h>
#include <opal.h>
#include <opal-msg.h>
#include <processor.h>
#include <chiptod.h>
#include <lock.h>
#include <xscom.h>
#include <capp.h>
#include <pci.h>
#include <cpu.h>
#include <chip.h>
/*
* HMER register layout:
* +===+==========+============================+========+===================+
* |Bit|Name |Description |PowerKVM|Action |
* | | | |HMI | |
* | | | |enabled | |
* | | | |for this| |
* | | | |bit ? | |
* +===+==========+============================+========+===================+
* |0  |malfunctio|A processor core in the     |Yes     |Raise attn from    |
* |   |n_alert   |system has checkstopped     |        |sapphire resulting |
* |   |          |(failed recovery) and has   |        |in xstop           |
* |   |          |requested a CP Sparing      |        |                   |
* |   |          |to occur. This is           |        |                   |
* |   |          |broadcast to every          |        |                   |
* |   |          |processor in the system     |        |                   |
* |---+----------+----------------------------+--------+-------------------|
* |1 |Reserved |reserved |n/a | |
* |---+----------+----------------------------+--------+-------------------|
* |2  |proc_recv_|Processor recovery occurred |Yes     |Log message and    |
* |   |done      |error bit in FIR not masked |        |continue working.  |
* |   |          |(see bit 11)                |        |                   |
* |---+----------+----------------------------+--------+-------------------|
* |3 |proc_recv_|Processor went through |Yes |Log message and |
* | |error_mask|recovery for an error which | |continue working. |
* | |ed |is actually masked for | | |
* | | |reporting | | |
* |---+----------+----------------------------+--------+-------------------|
* |4 | |Timer facility experienced |Yes |Raise attn from |
* | |tfac_error|an error. | |sapphire resulting |
* |   |          |TB, DEC, HDEC, PURR or SPURR|        |in xstop           |
* | | |may be corrupted (details in| | |
* | | |TFMR) | | |
* |---+----------+----------------------------+--------+-------------------|
* |5 | |TFMR SPR itself is |Yes |Raise attn from |
* | |tfmr_parit|corrupted. | |sapphire resulting |
* |   |y_error   |Entire timing facility may  |        |in xstop           |
* | | |be compromised. | | |
* |---+----------+----------------------------+--------+-------------------|
* |6  |ha_overflo| UPS (Uninterrupted Power   |No      |N/A                |
* |   |w_warning |System) overflow indication |        |                   |
* |   |          |that the UPS                |        |                   |
* |   |          |DirtyAddrTable has          |        |                   |
* |   |          |reached a limit where it    |        |                   |
* |   |          |requires PHYP unload support|        |                   |
* |---+----------+----------------------------+--------+-------------------|
* |7 |reserved |reserved |n/a |n/a |
* |---+----------+----------------------------+--------+-------------------|
* |8 |xscom_fail|An XSCOM operation caused by|No |We handle it by |
* | | |a cache inhibited load/store| |manually reading |
* | | |from this thread failed. A | |HMER register. |
* | | |trap register is | | |
* | | |available. | | |
* | | | | | |
* |---+----------+----------------------------+--------+-------------------|
* |9 |xscom_done|An XSCOM operation caused by|No |We handle it by |
* | | |a cache inhibited load/store| |manually reading |
* | | |from this thread completed. | |HMER register. |
* | | |If hypervisor | | |
* | | |intends to use this bit, it | | |
* | | |is responsible for clearing | | |
* | | |it before performing the | | |
* | | |xscom operation. | | |
* | | |NOTE: this bit should always| | |
* | | |be masked in HMEER | | |
* |---+----------+----------------------------+--------+-------------------|
* |10 |reserved |reserved |n/a |n/a |
* |---+----------+----------------------------+--------+-------------------|
* |11 |proc_recv_|Processor recovery occurred |Yes     |Log message and    |
* | |again |again before bit2 or bit3 | |continue working. |
* | | |was cleared | | |
* |---+----------+----------------------------+--------+-------------------|
* |12-|reserved  |reserved (formerly: a       |n/a     |n/a                |
* |15 |          |temperature sensor passed   |        |                   |
* |   |          |the critical point going up)|        |                   |
* |---+----------+----------------------------+--------+-------------------|
* |16 | |SCOM has set a reserved FIR |No |n/a |
* | |scom_fir_h|bit to cause recovery | | |
* |   |mi        |                            |        |                   |
* |---+----------+----------------------------+--------+-------------------|
* |17 |trig_fir_h|Debug trigger has set a |No |n/a |
* | |mi |reserved FIR bit to cause | | |
* | | |recovery | | |
* |---+----------+----------------------------+--------+-------------------|
* |18 |reserved |reserved |n/a |n/a |
* |---+----------+----------------------------+--------+-------------------|
* |19 |reserved |reserved |n/a |n/a |
* |---+----------+----------------------------+--------+-------------------|
* |20 |hyp_resour|A hypervisor resource error |Yes     |Raise attn from    |
* |   |ce_err    |occurred: data parity error |        |sapphire resulting |
* |   |          |on SPRC0:3, SPR_Modereg or  |        |in xstop.          |
* |   |          |HMEER.                      |        |                   |
* |   |          |Note: this bit will cause a |        |                   |
* |   |          |checkstop when (HV=1, PR=0  |        |                   |
* |   |          |and EE=0)                   |        |                   |
* |---+----------+----------------------------+--------+-------------------|
* |21-|          |If bit 8 is active, the     |No      |We handle it by    |
* |23 |xscom_stat|reason will be detailed in  |        |manually reading   |
* |   |us        |these bits. See chapter 11.1|        |HMER register.     |
* |   |          |These bits are information  |        |                   |
* |   |          |only and always masked      |        |                   |
* |   |          |(mask = '0')                |        |                   |
* |   |          |If hypervisor intends to    |        |                   |
* |   |          |use these bits, it is       |        |                   |
* |   |          |responsible for clearing    |        |                   |
* |   |          |them before performing the  |        |                   |
* |   |          |xscom operation.            |        |                   |
* |---+----------+----------------------------+--------+-------------------|
* |24-|Not |Not implemented |n/a |n/a |
* |63 |implemente| | | |
* | |d | | | |
* +---+----------+----------------------------+--------+-------------------+
*
* The above HMER bits can be enabled/disabled by modifying the
* SPR_HMEER_HMI_ENABLE_MASK #define in include/processor.h.
* If you modify support for any of the bits listed above, please make sure
* you update the above table to reflect that.
*
* NOTE: Per Dave Larson, never enable 8,9,21-23
*/
/* Used for tracking cpu threads inside hmi handling. */
#define HMI_STATE_CLEANUP_DONE 0x100
#define CORE_THREAD_MASK 0x0ff
#define SUBCORE_THREAD_MASK(s_id, t_count) \
((((1UL) << (t_count)) - 1) << ((s_id) * (t_count)))
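/*
* SUBCORE_THREAD_MASK(s_id, t_count) selects the t_count thread bits
* belonging to subcore s_id, e.g. SUBCORE_THREAD_MASK(1, 2) == 0xc.
*/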
/* xscom addresses for core FIR (Fault Isolation Register) */
#define CORE_FIR 0x10013100
#define NX_STATUS_REG 0x02013040 /* NX status register */
#define NX_DMA_ENGINE_FIR 0x02013100 /* DMA & Engine FIR Data Register */
#define NX_PBI_FIR 0x02013080 /* PowerBus Interface FIR Register */
/*
* Bit 54 of the NX status register is set to 1 when an HMI interrupt
* is triggered due to an NX checkstop.
*/
#define NX_HMI_ACTIVE PPC_BIT(54)
/* Number of iterations for the various timeouts */
#define TIMEOUT_LOOPS 20000000
static const struct core_xstop_bit_info {
uint8_t bit; /* CORE FIR bit number */
enum OpalHMI_CoreXstopReason reason;
} xstop_bits[] = {
{ 3, CORE_CHECKSTOP_IFU_REGFILE },
{ 5, CORE_CHECKSTOP_IFU_LOGIC },
{ 8, CORE_CHECKSTOP_PC_DURING_RECOV },
{ 10, CORE_CHECKSTOP_ISU_REGFILE },
{ 12, CORE_CHECKSTOP_ISU_LOGIC },
{ 21, CORE_CHECKSTOP_FXU_LOGIC },
{ 25, CORE_CHECKSTOP_VSU_LOGIC },
{ 26, CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE },
{ 32, CORE_CHECKSTOP_LSU_REGFILE },
{ 36, CORE_CHECKSTOP_PC_FWD_PROGRESS },
{ 38, CORE_CHECKSTOP_LSU_LOGIC },
{ 45, CORE_CHECKSTOP_PC_LOGIC },
{ 48, CORE_CHECKSTOP_PC_HYP_RESOURCE },
{ 52, CORE_CHECKSTOP_PC_HANG_RECOV_FAILED },
{ 54, CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED },
{ 60, CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ },
{ 63, CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ },
};
static const struct nx_xstop_bit_info {
uint8_t bit; /* NX FIR bit number */
enum OpalHMI_NestAccelXstopReason reason;
} nx_dma_xstop_bits[] = {
{ 1, NX_CHECKSTOP_SHM_INVAL_STATE_ERR },
{ 15, NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1 },
{ 16, NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2 },
{ 20, NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR },
{ 21, NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR },
{ 22, NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR },
{ 23, NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR },
{ 24, NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR },
{ 25, NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR },
{ 26, NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR },
{ 27, NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR },
{ 31, NX_CHECKSTOP_DMA_CRB_UE },
{ 32, NX_CHECKSTOP_DMA_CRB_SUE },
};
static const struct nx_xstop_bit_info nx_pbi_xstop_bits[] = {
{ 12, NX_CHECKSTOP_PBI_ISN_UE },
};
static struct lock hmi_lock = LOCK_UNLOCKED;
static int queue_hmi_event(struct OpalHMIEvent *hmi_evt, int recover)
{
uint64_t *hmi_data;
/* Don't queue up event if recover == -1 */
if (recover == -1)
return 0;
/* set disposition */
if (recover == 1)
hmi_evt->disposition = OpalHMI_DISPOSITION_RECOVERED;
else if (recover == 0)
hmi_evt->disposition = OpalHMI_DISPOSITION_NOT_RECOVERED;
/*
* V2 of struct OpalHMIEvent is (4 * 64 bits) in size and well
* packed. Hence use a uint64_t pointer to pass the entire structure
* via the 4 params of the generic message format.
*/
hmi_data = (uint64_t *)hmi_evt;
/* queue up for delivery to host. */
return opal_queue_msg(OPAL_MSG_HMI_EVT, NULL, NULL,
hmi_data[0], hmi_data[1], hmi_data[2],
hmi_data[3]);
}
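/*
* A CAPP error is recoverable if bit 0 of the CAPP error status and
* control register is set.
*/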
static int is_capp_recoverable(int chip_id)
{
uint64_t reg;
xscom_read(chip_id, CAPP_ERR_STATUS_CTRL, &reg);
return (reg & PPC_BIT(0)) != 0;
}
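/*
* Walk the PHBs attached to this chip's CAPP and put the matching PHB
* into CAPP recovery mode. Returns 1 if such a PHB was found.
*/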
static int handle_capp_recoverable(int chip_id)
{
struct dt_node *np;
u64 phb_id;
u32 dt_chip_id;
struct phb *phb;
u32 phb_index;
struct proc_chip *chip = get_chip(chip_id);
u8 mask = chip->capp_phb3_attached_mask;
dt_for_each_compatible(dt_root, np, "ibm,power8-pciex") {
dt_chip_id = dt_prop_get_u32(np, "ibm,chip-id");
phb_index = dt_prop_get_u32(np, "ibm,phb-index");
phb_id = dt_prop_get_u64(np, "ibm,opal-phbid");
if ((mask & (1 << phb_index)) && (chip_id == dt_chip_id)) {
phb = pci_get_phb(phb_id);
phb->ops->lock(phb);
phb->ops->set_capp_recovery(phb);
phb->ops->unlock(phb);
return 1;
}
}
return 0;
}
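/*
* Check whether the malfunction alert from this chip is a recoverable
* CAPP error and fill in the HMI event accordingly. Returns 1 if the
* error was handled as a CAPP recovery, 0 otherwise.
*/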
static int decode_one_malfunction(int flat_chip_id, struct OpalHMIEvent *hmi_evt)
{
hmi_evt->severity = OpalHMI_SEV_FATAL;
hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
if (is_capp_recoverable(flat_chip_id)) {
if (handle_capp_recoverable(flat_chip_id) == 0)
return 0;
hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
hmi_evt->type = OpalHMI_ERROR_CAPP_RECOVERY;
return 1;
}
/* TODO check other FIRs */
return 0;
}
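/* Read this core's FIR and translate the bits set into checkstop reasons. */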
static bool decode_core_fir(struct cpu_thread *cpu,
struct OpalHMIEvent *hmi_evt)
{
uint64_t core_fir;
uint32_t core_id;
int i;
bool found = false;
/* Sanity check */
if (!cpu || !hmi_evt)
return false;
core_id = pir_to_core_id(cpu->pir);
/* Get CORE FIR register value. */
if (xscom_read(cpu->chip_id, XSCOM_ADDR_P8_EX(core_id, CORE_FIR),
&core_fir) != 0) {
prerror("HMI: XSCOM error reading CORE FIR\n");
return false;
}
prlog(PR_INFO, "HMI: CHIP ID: %x, CORE ID: %x, FIR: %016llx\n",
cpu->chip_id, core_id, core_fir);
/* Check CORE FIR bits and populate HMI event with error info. */
for (i = 0; i < ARRAY_SIZE(xstop_bits); i++) {
if (core_fir & PPC_BIT(xstop_bits[i].bit)) {
found = true;
hmi_evt->u.xstop_error.xstop_reason
|= xstop_bits[i].reason;
}
}
return found;
}
static void find_core_checkstop_reason(struct OpalHMIEvent *hmi_evt,
int *event_generated)
{
struct cpu_thread *cpu;
/* Initialize HMI event */
hmi_evt->severity = OpalHMI_SEV_FATAL;
hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_CORE;
/*
* Check CORE FIRs and find the reason for core checkstop.
* Send a separate HMI event for each core that has checkstopped.
*/
for_each_cpu(cpu) {
/* GARDed CPUs are marked unavailable. Skip them. */
if (cpu->state == cpu_state_unavailable)
continue;
/* Only check on primaries (i.e. cores), not threads */
if (cpu->is_secondary)
continue;
/* Initialize xstop_error fields. */
hmi_evt->u.xstop_error.xstop_reason = 0;
hmi_evt->u.xstop_error.u.pir = cpu->pir;
if (decode_core_fir(cpu, hmi_evt)) {
queue_hmi_event(hmi_evt, 0);
*event_generated = 1;
}
}
}
static void find_nx_checkstop_reason(int flat_chip_id,
struct OpalHMIEvent *hmi_evt, int *event_generated)
{
uint64_t nx_status;
uint64_t nx_dma_fir;
uint64_t nx_pbi_fir;
int i;
/* Get NX status register value. */
if (xscom_read(flat_chip_id, NX_STATUS_REG, &nx_status) != 0) {
prerror("HMI: XSCOM error reading NX_STATUS_REG\n");
return;
}
/* Check if NX has driven an HMI interrupt. */
if (!(nx_status & NX_HMI_ACTIVE))
return;
/* Initialize HMI event */
hmi_evt->severity = OpalHMI_SEV_FATAL;
hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NX;
hmi_evt->u.xstop_error.u.chip_id = flat_chip_id;
/* Get DMA & Engine FIR data register value. */
if (xscom_read(flat_chip_id, NX_DMA_ENGINE_FIR, &nx_dma_fir) != 0) {
prerror("HMI: XSCOM error reading NX_DMA_ENGINE_FIR\n");
return;
}
/* Get PowerBus Interface FIR data register value. */
if (xscom_read(flat_chip_id, NX_PBI_FIR, &nx_pbi_fir) != 0) {
prerror("HMI: XSCOM error reading NX_PBI_FIR\n");
return;
}
/* Find NX checkstop reason and populate HMI event with error info. */
for (i = 0; i < ARRAY_SIZE(nx_dma_xstop_bits); i++)
if (nx_dma_fir & PPC_BIT(nx_dma_xstop_bits[i].bit))
hmi_evt->u.xstop_error.xstop_reason
|= nx_dma_xstop_bits[i].reason;
for (i = 0; i < ARRAY_SIZE(nx_pbi_xstop_bits); i++)
if (nx_pbi_fir & PPC_BIT(nx_pbi_xstop_bits[i].bit))
hmi_evt->u.xstop_error.xstop_reason
|= nx_pbi_xstop_bits[i].reason;
/*
* Set NXDMAENGFIR[38] to signal PRD that service action is required.
* Without this inject, PRD will not be able to do NX unit checkstop
* error analysis. NXDMAENGFIR[38] is a spare bit used to report
* a software initiated attention.
*
* The behavior of this bit and all FIR bits is documented in the
* RAS spreadsheet.
*/
xscom_write(flat_chip_id, NX_DMA_ENGINE_FIR, PPC_BIT(38));
/* Send an HMI event. */
queue_hmi_event(hmi_evt, 0);
*event_generated = 1;
}
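/*
* Find the chip(s) that raised the malfunction alert, decode the
* checkstop reason on each and queue HMI events for the host.
*/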
static int decode_malfunction(struct OpalHMIEvent *hmi_evt)
{
int i;
int recover = -1;
uint64_t malf_alert;
int event_generated = 0;
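/*
* Read the malfunction alert register; each bit set identifies a chip
* (by flat chip id) signalling a malfunction.
*/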
xscom_read(this_cpu()->chip_id, 0x02020011, &malf_alert);
for (i = 0; i < 64; i++)
if (malf_alert & PPC_BIT(i)) {
recover = decode_one_malfunction(i, hmi_evt);
xscom_write(this_cpu()->chip_id, 0x02020011, ~PPC_BIT(i));
if (recover) {
queue_hmi_event(hmi_evt, recover);
event_generated = 1;
}
find_nx_checkstop_reason(i, hmi_evt, &event_generated);
}
if (recover != -1) {
find_core_checkstop_reason(hmi_evt, &event_generated);
/*
* In case we fail to find the checkstop reason, send an
* unknown HMI event.
*/
if (!event_generated) {
hmi_evt->u.xstop_error.xstop_type =
CHECKSTOP_TYPE_UNKNOWN;
hmi_evt->u.xstop_error.xstop_reason = 0;
}
}
return recover;
}
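/*
* Spin until a thread on this core marks the pre-recovery cleanup as
* done (HMI_STATE_CLEANUP_DONE), or we time out.
*/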
static void wait_for_subcore_threads(void)
{
uint64_t timeout = 0;
while (!(*(this_cpu()->core_hmi_state_ptr) & HMI_STATE_CLEANUP_DONE)) {
/*
* We use a fixed number of TIMEOUT_LOOPS rather than the timebase
* to do a pseudo-wall-time timeout, because the timebase may not
* actually be working at this point.
*/
if (++timeout >= (TIMEOUT_LOOPS*3)) {
/*
* Break out of the loop here and fall through to the
* recovery code. If recovery fails, the kernel will be
* informed of the failure. This way we avoid looping
* here if other threads are stuck.
*/
prlog(PR_DEBUG, "HMI: TB pre-recovery timeout\n");
break;
}
cpu_relax();
}
}
/*
* For successful recovery of a TB residue error, remove the dirty
* data from the TB/HDEC register in each active partition (subcore).
* Writing zeros to TB/HDEC achieves this.
*/
static void timer_facility_do_cleanup(uint64_t tfmr)
{
if (tfmr & SPR_TFMR_TB_RESIDUE_ERR) {
/* Reset the TB register to clear the dirty data. */
mtspr(SPR_TBWU, 0);
mtspr(SPR_TBWL, 0);
}
if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR) {
/* Reset HDEC register */
mtspr(SPR_HDEC, 0);
}
}
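/* Return the number of subcores per core: 1 (unsplit), 2 or 4. */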
static int get_split_core_mode(void)
{
uint64_t hid0;
hid0 = mfspr(SPR_HID0);
if (hid0 & SPR_HID0_POWER8_2LPARMODE)
return 2;
else if (hid0 & SPR_HID0_POWER8_4LPARMODE)
return 4;
return 1;
}
/*
* Certain TB/HDEC errors leave dirty data in the timebase and HDEC
* registers, which needs to be cleared before we initiate
* clear_tb_errors through TFMR[24].
* The cleanup has to be done once by any one thread from the core or
* subcore.
*
* In split core mode, the dirty data must be cleared from the TB/HDEC
* register by all subcores (active partitions) before we clear tb errors
* through TFMR[24]. The HMI recovery would fail even if one subcore does
* not clean up its respective TB/HDEC register.
*
* For an un-split core, any one thread can do the cleanup.
* For a split core, any one thread from each subcore can do the cleanup.
*
* Errors that require pre-recovery cleanup:
* - SPR_TFMR_TB_RESIDUE_ERR
* - SPR_TFMR_HDEC_PARITY_ERROR
*/
static void pre_recovery_cleanup(void)
{
uint64_t hmer;
uint64_t tfmr;
uint32_t sibling_thread_mask;
int split_core_mode, subcore_id, thread_id, threads_per_core;
int i;
hmer = mfspr(SPR_HMER);
/* Exit if this is not a timer facility error. */
if (!(hmer & SPR_HMER_TFAC_ERROR))
return;
/*
* Exit if it is not an error that leaves dirty data in the timebase
* or HDEC register. Or this may be a thread that came in very late,
* after the recovery has already been done.
*
* TFMR is a per-[sub]core register. If any one thread on the
* [sub]core does the recovery, it is reflected in the TFMR register
* and applies to all threads in that [sub]core. Hence take a lock
* before checking TFMR errors. Once a thread from a [sub]core
* completes the recovery, all other threads on that [sub]core will
* return from here.
*
* If TFMR does not show the error that we are looking for, return
* from here. We will just fall through to the recovery code, which
* checks for other errors in TFMR and fixes them.
*/
lock(&hmi_lock);
tfmr = mfspr(SPR_TFMR);
if (!(tfmr & (SPR_TFMR_TB_RESIDUE_ERR | SPR_TFMR_HDEC_PARITY_ERROR))) {
unlock(&hmi_lock);
return;
}
/* Gather split core information. */
split_core_mode = get_split_core_mode();
threads_per_core = cpu_thread_count / split_core_mode;
/* Prepare core/subcore sibling mask */
thread_id = cpu_get_thread_index(this_cpu());
subcore_id = thread_id / threads_per_core;
sibling_thread_mask = SUBCORE_THREAD_MASK(subcore_id, threads_per_core);
/*
* First thread on the core?
* If yes, set the HMI cleanup state to !DONE.
*/
if ((*(this_cpu()->core_hmi_state_ptr) & CORE_THREAD_MASK) == 0)
*(this_cpu()->core_hmi_state_ptr) &= ~HMI_STATE_CLEANUP_DONE;
/*
* First thread on the subcore?
* If yes, do the cleanup.
*
* Clear TB and wait for the other threads (one from each subcore) to
* finish their cleanup work.
*/
if ((*(this_cpu()->core_hmi_state_ptr) & sibling_thread_mask) == 0)
timer_facility_do_cleanup(tfmr);
/*
* Mark this thread's bit. This bit stays on until this thread
* exits from handle_hmi_exception().
*/
*(this_cpu()->core_hmi_state_ptr) |= this_cpu()->thread_mask;
/*
* Check if each subcore has completed the cleanup work.
* If yes, notify all the threads that we are done with the cleanup.
*/
for (i = 0; i < split_core_mode; i++) {
uint32_t subcore_thread_mask =
SUBCORE_THREAD_MASK(i, threads_per_core);
if (!(*(this_cpu()->core_hmi_state_ptr) & subcore_thread_mask))
break;
}
if (i == split_core_mode)
*(this_cpu()->core_hmi_state_ptr) |= HMI_STATE_CLEANUP_DONE;
unlock(&hmi_lock);
/* Wait for the other subcores to complete their cleanup. */
wait_for_subcore_threads();
}
static void hmi_exit(void)
{
/* unconditionally unset the thread bit */
*(this_cpu()->core_hmi_state_ptr) &= ~(this_cpu()->thread_mask);
}
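/*
* Main HMI handler: processes each error source flagged in HMER,
* queues an OPAL HMI event per error when hmi_evt is non-NULL, and
* returns whether recovery succeeded.
*/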
int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
{
int recover = 1;
uint64_t tfmr;
/*
* In split core mode, some of the timer facility errors need to be
* cleaned up before we proceed with the error recovery.
*/
pre_recovery_cleanup();
lock(&hmi_lock);
/*
* Not all HMIs move the TB into an invalid state. Set the TB state
* by looking at the TFMR register, which tells us the correct state
* of the TB register.
*/
this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
prlog(PR_DEBUG, "HMI: Received HMI interrupt: HMER = 0x%016llx\n", hmer);
if (hmi_evt)
hmi_evt->hmer = hmer;
if (hmer & SPR_HMER_PROC_RECV_DONE) {
hmer &= ~SPR_HMER_PROC_RECV_DONE;
if (hmi_evt) {
hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
queue_hmi_event(hmi_evt, recover);
}
prlog(PR_DEBUG, "HMI: Processor recovery Done.\n");
}
if (hmer & SPR_HMER_PROC_RECV_ERROR_MASKED) {
hmer &= ~SPR_HMER_PROC_RECV_ERROR_MASKED;
if (hmi_evt) {
hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_MASKED;
queue_hmi_event(hmi_evt, recover);
}
prlog(PR_DEBUG, "HMI: Processor recovery Done (masked).\n");
}
if (hmer & SPR_HMER_PROC_RECV_AGAIN) {
hmer &= ~SPR_HMER_PROC_RECV_AGAIN;
if (hmi_evt) {
hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE_AGAIN;
queue_hmi_event(hmi_evt, recover);
}
prlog(PR_DEBUG, "HMI: Processor recovery occurred again before"
"bit2 was cleared\n");
}
/* A malfunction alert means we cannot continue; decode and report it. */
if (hmer & SPR_HMER_MALFUNCTION_ALERT) {
hmer &= ~SPR_HMER_MALFUNCTION_ALERT;
recover = 0;
if (hmi_evt) {
recover = decode_malfunction(hmi_evt);
queue_hmi_event(hmi_evt, recover);
}
}
/* A hypervisor resource error is fatal; we cannot continue. */
if (hmer & SPR_HMER_HYP_RESOURCE_ERR) {
hmer &= ~SPR_HMER_HYP_RESOURCE_ERR;
recover = 0;
if (hmi_evt) {
hmi_evt->severity = OpalHMI_SEV_FATAL;
hmi_evt->type = OpalHMI_ERROR_HYP_RESOURCE;
queue_hmi_event(hmi_evt, recover);
}
}
/*
* Assert for now on all TOD errors. In the future we need to decode
* TFMR and take corrective action wherever required.
*/
if (hmer & SPR_HMER_TFAC_ERROR) {
tfmr = mfspr(SPR_TFMR); /* save original TFMR */
hmer &= ~SPR_HMER_TFAC_ERROR;
recover = chiptod_recover_tb_errors();
if (hmi_evt) {
hmi_evt->severity = OpalHMI_SEV_ERROR_SYNC;
hmi_evt->type = OpalHMI_ERROR_TFAC;
hmi_evt->tfmr = tfmr;
queue_hmi_event(hmi_evt, recover);
}
}
if (hmer & SPR_HMER_TFMR_PARITY_ERROR) {
tfmr = mfspr(SPR_TFMR); /* save original TFMR */
hmer &= ~SPR_HMER_TFMR_PARITY_ERROR;
recover = chiptod_recover_tb_errors();
if (hmi_evt) {
hmi_evt->severity = OpalHMI_SEV_FATAL;
hmi_evt->type = OpalHMI_ERROR_TFMR_PARITY;
hmi_evt->tfmr = tfmr;
queue_hmi_event(hmi_evt, recover);
}
}
/*
* HMER bits are sticky: once set to 1 they remain set until they are
* written back to 0. Reset the error source bits to 0, otherwise we
* keep getting the HMI interrupt again and again.
*/
mtspr(SPR_HMER, hmer);
hmi_exit();
/* Set the TB state looking at TFMR register before we head out. */
this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
unlock(&hmi_lock);
return recover;
}
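/* OPAL call handler: invoked by the host to process a pending HMI. */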
static int64_t opal_handle_hmi(void)
{
uint64_t hmer;
struct OpalHMIEvent hmi_evt;
/*
* Compile-time check to ensure the size of struct OpalHMIEvent does
* not exceed that of struct opal_msg.
*/
BUILD_ASSERT(sizeof(struct opal_msg) >= sizeof(struct OpalHMIEvent));
memset(&hmi_evt, 0, sizeof(struct OpalHMIEvent));
hmi_evt.version = OpalHMIEvt_V2;
hmer = mfspr(SPR_HMER); /* Get HMER register value */
handle_hmi_exception(hmer, &hmi_evt);
return OPAL_SUCCESS;
}
opal_call(OPAL_HANDLE_HMI, opal_handle_hmi, 0);