# blob: 5f295fdc962b3b2dc661bc61ec6de89874dafa4c
#!/usr/bin/env python3
import sys
import ppc
import re
# Mnemonic PHB_ESR - Address Offset 0x0C80 - phbErrorStatusRegister
#
# Table of (bit number, description) pairs. PHBError.show_errs() converts each
# bit number to a mask with ppc.ppcbit() and prints the description when that
# bit is set in the register value. Bit numbers missing from the table (e.g.
# 19) are simply never reported.
phb_esr_bits = [
    (0, "ETU/RSB Request Address Error"),
    (1, "Fundamental A Request Address Error"),
    (2, "Fundamental A Request Size/Alignment Error"),
    (3, "Fundamental A PCI CFG Addr/Size Error"),
    (4, "Fundamental A IODA Table Access Error"),
    (5, "Fundamental A Internal Registers Parity Error"),
    (6, "PHB Error Registers Request Address Error"),
    (7, "PHB Error Registers Request Size/Alignment Error"),
    (8, "Fundamental B Request Address Error"),
    (9, "Fundamental B Request Size/Alignment Error"),
    (10, "Fundamental B Internal Registers Parity Error"),
    (11, "Internal Bus Logic Bad PCIE Macro Request Address"),
    (12, "Debug Request Address Error"),
    (13, "Debug Request Size/Alignment Error"),
    (14, "Debug Internal Registers Parity Error"),
    (15, "Internal Bus Logic State Machine One-Hot Error"),
    (16, "UV Page Request Address Error"),
    (17, "UV Page Request Size/Alignment Error"),
    (18, "UV Page Internal Registers Parity Error"),
    (20, "RXE_ARB OR Error Status"),
    (21, "RXE_MRG OR Error Status"),
    (22, "RXE_TCE OR Error Status"),
    (23, "TXE OR Error Status"),
    (24, "pcie_etu_regb_err_inf"),
    (25, "pcie_etu_regb_err_erc"),
    (26, "pcie_etu_regb_err_fat"),
    (27, "bus_regs_req_wr_data_p_e"),
    (28, "SCOM HV Indirect Access Error"),
    (29, "SCOM UV Indirect Access Error"),
    (30, "SCOM Internal Registers Parity Error"),
    (31, "SCOM Satellite Finite State Machine Error"),
]
# Mnemonic TXE_ESR - Address Offset 0x0D00 - txeFirstErrorStatus
#
# (bit number, description) pairs used by PHBError.show_errs() to decode the
# register stored under the name "phbTxeErrorStatus".
# NOTE(review): bits 28 and 30 carry the same description - confirm against
# the register specification.
txe_esr_bits = [
    (0, "AIB Command Invalid"),
    (2, "AIB Address Decode Error"),
    (3, "AIB Size Invalid"),
    (4, "AIB Cmd Ctrls Parity Error"),
    (5, "AIB Data Ctrls Parity Error"),
    (8, "AIB Alignment Error"),
    (9, "AIB Cmd Bus Parity Error"),
    (10, "AIB Data Bus UE ECC Error"),
    (11, "AIB Data Ctrls Sequence Error"),
    (12, "AIB Data Bus CE ECC Error"),
    (13, "TCE Rd Response DAT_ERR Indication"),
    (14, "AIB Command Credits Error"),
    (15, "AIB Data Credits Error"),
    (16, "BLIF Controls Parity Error"),
    (17, "CFG Write Error CA or UR response"),
    (18, "BLIF Forward Progress Timeout"),
    (19, "MMIO RD Pending Error"),
    (20, "MMIO WR Pending Error"),
    (21, "MMIO CFG Pending Error"),
    (22, "MMIO Write DAT_ERR Indication"),
    (23, "CI Store Data Fifo Error"),
    (24, "CFG Enable Error, RRB"),
    (25, "CFG Size Error"),
    (26, "CFG Bus Address Error"),
    (27, "CFG Link Down Error"),
    (28, "PAPR TXE Injection Error Triggered"),
    (29, "CFG Write Request Timeout"),
    (30, "PAPR TXE Injection Error Triggered"),
    (36, "CI Trigger Buffer ECC Correctable Error"),
    (37, "CI Trigger Buffer ECC Uncorrectable Error"),
    (38, "CI Trigger Buffer Stage Data Parity Error"),
    (40, "MMIO BAR Table (MBT) Parity Error"),
    (42, "MMIO Domain Table (MDT) ECC Correctable Error"),
    (43, "MMIO Domain Table (MDT) ECC Uncorrectable Error"),
    (44, "MMIO Domain Table (MDT) Stage Parity Error"),
    (45, "MMIO Domain Table (MDT) Stage Valid Error"),
    (46, "AIB Data Special Uncorrectable Error (SUE)"),
    (47, "MMIO Domain Table (MDT)"),
    (48, "P2P Store Data Fifo Error"),
    (49, "EPAT Table Parity Error"),
    (50, "MMIO Cmd Parity Error"),
    (51, "BLIF1 Reg Parity Error"),
    (52, "P2P1 Reg Parity Error"),
    (53, "P2P WR Pending Error"),
    (54, "CRW Onehot Error"),
    (55, "CRW Pending Error"),
    (56, "RRB Parity Error"),
    (57, "RRB Size/Alignment Error"),
    (58, "s_bad_addr_e_q"),
    (59, "s_req_size_align_e_q"),
]
# Mnemonic RXE_ARB_ESR - Address Offset 0x0D80 - phbRxeArbErrorStatus
#
# (bit number, description) pairs used by PHBError.show_errs() to decode the
# register stored under the name "phbRxeArbErrorStatus".
rxe_arb_bits = [
    (0, "BLIF Inbound CA Completion Error"),
    (1, "BLIF Inbound UR Completion Error"),
    (2, "MSI Size Error"),
    (3, "MSI Address Alignment Error"),
    (5, "BLIF Inbound Header ECC Correctable (CE)"),
    (6, "BLIF Inbound Header ECC Uncorrectable (UE)"),
    (7, "ARB Stage Valid Error"),
    (8, "TCE Tag Release Unused"),
    (9, "TCE Tag Used, Not Free"),
    (10, "ARB MMIO Buffer Overflow"),
    (11, "ARB MMIO Buffer Underflow"),
    (12, "ARB MMIO Internal Parity Error"),
    (13, "ARB DMA Buffer Overflow"),
    (14, "ARB DMA Buffer Underflow"),
    (15, "ARB DMA Internal Parity Error"),
    (16, "BLIF Header Control Bits Parity Error"),
    (17, "BLIF Data Control Bits Parity Error"),
    (18, "BLIF Unsupported Request (UR) Error"),
    (19, "BLIF Completion Timeout Error"),
    (20, "SEID Table ECC Correctable (CE)"),
    (21, "SEID Table ECC Uncorrectable (UE)"),
    (22, "NBW Size Error"),
    (23, "DEC IODA Table Fatal Error"),
    (24, "TLP Poisoned Error"),
    (25, "MIST ECC Correctable Error"),
    (26, "IODA TVT Entry Invalid"),
    (27, "MSI PE# Mismatch"),
    (28, "IODA TVT Address"),
    (29, "TVT ECC Correctable Error"),
    (30, "TVT ECC Uncorrectable Error"),
    (31, "MIST ECC Uncorrectable Error"),
    (32, "PELT-V BAR Disabled Error"),
    (33, "IODA Table Parity Error"),
    (34, "PCT Timeout"),
    (35, "PCT Unexpected Completion"),
    (36, "PCT Parity Error"),
    (37, "DEC Stage Valid Error"),
    (38, "DEC Stage Parity Error"),
    (39, "PAPR Inbound Injection Error Triggered"),
    (40, "DMA/MSI: RTE PE Number"),
    (41, "RTT BAR Disabled Error"),
    (42, "RTC Internal Parity Error"),
    (43, "RTC Queue Overflow"),
    (44, "RTC Queue Underflow"),
    (45, "RTC Stage Valid Error"),
    (46, "RTC RCAM Bad State Error"),
    (47, "RTC RCAM Multiple Hit Error"),
    (48, "RRB Parity Error"),
    (49, "RRB request Size / Alignment Error"),
    (50, "s_bad_addr_e_q"),
    (51, "s_req_size_align_e_q"),
    (54, "Discontiguous DMA Write Fragmentation"),
    (55, "LIST Table Parity Error"),
    (56, "LKP PEST Data Queue Error"),
    (57, "PCIE Fatal Error Message Received"),
    (58, "PCIE Nonfatal Error Message Received"),
    (59, "PCIE Correctable Error Message Received"),
]
# Mnemonic RXE_MRG_ESR - Address Offset 0x0E00 - phbRxeMrgErrorStatus
#
# (bit number, description) pairs used by PHBError.show_errs() to decode the
# register stored under the name "phbRxeMrgErrorStatus".
# NOTE(review): bits 17 and 18 carry the same (apparently truncated)
# description - confirm against the register specification.
rxe_mrg_bits = [
    (8, "MRG TMB Allocation Error"),
    (9, "MRG TMB Response Invalid"),
    (10, "MRG TMB Response Ready Error"),
    (11, "MRG MMIO Queue Overflow Error"),
    (12, "MRG MMIO Queue Underflow Error"),
    (13, "MRG MMIO Internal Parity Error"),
    (14, "MRG DMA Queue Overflow Error"),
    (15, "MRG DMA Queue Underflow Error"),
    (16, "MRG DMA Internal Parity Error"),
    (17, "MRG Migration Register Table"),
    (18, "MRG Migration Register Table"),
    (20, "s_bad_addr_e_q"),
    (21, "s_req_size_align_e_q"),
    (22, "RRB Parity Error"),
    (23, "RRB request Size / Alignment Error"),
    (24, "DSP AIB TX Timeout Error"),
    (25, "Reserved (vA4.1)"),
    (26, "DSP AIB TX CMD Credit Parity Error"),
    (28, "DSP AIB TX DAT Credit Parity Error"),
    (30, "DSP Command Credit Overflow Error"),
    (31, "DSP Command Credit Underflow Error"),
    (32, "DSP Command Credit Parity Error"),
    (33, "DSP Data Credit Overflow Error"),
    (34, "DSP Data Credit Underflow Error"),
    (35, "DSP Data Credit Parity Error"),
    (36, "DSP Completion State Machine One-Hot Error"),
    (37, "DSP Write Thread State Machine One-Hot Error"),
    (38, "DSP DMA Secure Address Error (vA4.2)"),
    (39, "DSP MSI Interrupt Notification Secure Address"),
    (40, "DSP TREQ ECC Correctable Error"),
    (41, "DSP TREQ ECC Uncorrectable Error"),
    (42, "DSP MMIO Queue Overflow Error"),
    (43, "DSP MMIO Queue Underflow Error"),
    (44, "DSP MMIO Internal Parity Error"),
    (45, "DSP DMA Queue Overflow Error"),
    (46, "DSP DMA Queue Underflow Error"),
    (47, "DSP DMA Internal Parity Error"),
    (48, "DSP Read Thread State Machine One-Hot Error"),
    (49, "DSP Table State Machine One-Hot Error"),
    (50, "DSP NBW State Machine One-Hot Error"),
    (51, "DSP TSM PEST BAR Disabled Error"),
    (56, "IPD ECC Correctable Error"),
    (57, "IPD ECC Uncorrectable Error"),
    (58, "ICPLD ECC Correctable Error"),
    (59, "ICPLD ECC Uncorrectable Error"),
    (60, "NBWD ECC Correctable Error"),
    (61, "NBWD ECC Uncorrectable Error"),
    (63, "pb_etu_ai_rx_raise_fence"),
]
# Mnemonic RXE_TCE_ESR - Address Offset 0x0E80 - phbRxeTceErrorStatus
#
# (bit number, description) pairs used by PHBError.show_errs() to decode the
# register stored under the name "phbRxeTceErrorStatus".
rxe_tce_bits = [
    (0, "TCE CMP Internal Parity Error"),
    (1, "TCE Request Page Access Error"),
    (2, "TCE Response Page Access Error"),
    (3, "TCE CMP Queue Overflow"),
    (4, "TCE CMP Queue Underflow"),
    (5, "TCE Secure Address Error"),
    (6, "TCE Cache Bad State Error"),
    (7, "TCE Cache Multi-Way Hit Error"),
    (8, "TCE Request Timeout Error"),
    (9, "TCE TCR ECC Correctable Error"),
    (10, "TCE TCR ECC Uncorrectable Error"),
    (11, "TCE TDR ECC Correctable Error"),
    (12, "TCE TDR ECC Uncorrectable Error"),
    (13, "TCE Unexpected Response Error"),
    (14, "RRB Parity Error"),
    (15, "RRB request Size / Alignment Error"),
    (16, "TCE RES Internal Parity Error"),
    (17, "s_bad_addr_e_q"),
    (18, "s_req_size_align_e_q"),
    (19, "TCE RES Queue Overflow"),
    (20, "TCE RES Queue Underflow"),
    (21, "TCE Response Data Parity Error"),
    (22, "TCE TCLB CAM Bad State Error"),
    (23, "TCE TCLB CAM Multi-Hit Error"),
    (24, "TCE Kill Internal Parity Error"),
    (25, "TCE THASH Array ECC Correctable Error"),
    (26, "TCE THASH Array ECC Uncorrectable Error"),
    (27, "TCE TCLB TDAT ECC Correctable Error"),
    (28, "TCE TCLB TDAT ECC Uncorrectable Error"),
    (29, "TCE Kill State Machine One-Hot Error"),
    (30, "TCE Kill Queue Overflow"),
    (31, "TCE Kill Queue Underflow"),
    (32, "TCE Request Secure Address Register"),
    (33, "TCE Response Secure Address Register"),
]
# Mnemonic PBL_ESR - Address Offset 0x1900 - phbPblErrorStatus
#
# (bit number, description) pairs used by PHBError.show_errs() to decode the
# register stored under the name "phbPblErrorStatus". Most descriptions are
# the raw signal names; only bits 0 and 1 include a prose explanation.
pbl_esr_bits = [
    (0, "pb_err_p_fe_tlif_rx_par_e Parity error detected on TLIF Receive interface."),
    (1, "pb_err_p_fe_tlif_tx_par_e Parity error detected on TLIF Transmit interface."),
    (2, "pb_err_p_fe_blif_out_par_e"),
    (3, "pb_err_p_fe_blif_in_par_e"),
    (4, "pb_err_p_fe_int_par_e"),
    (5, "pb_err_p_fe_toc_cred_e"),
    (6, "pb_err_p_fe_ocf_par_e"),
    (7, "pb_err_p_fe_ocf_prot_e"),
    (12, "pb_err_p_fe_pct_erq_overflow_e"),
    (13, "pb_err_p_fe_pct_erq_underflow_e"),
    (14, "pb_err_p_fe_pct_onp_tags_rls_unused_e"),
    (15, "pb_err_p_fe_pct_onp_tags_used_notfree_e"),
    (16, "pb_err_p_fe_pct_onp_tags_used_unexp_e"),
    (17, "pb_err_p_fe_bct_onp_tags_rls_unused_e"),
    (18, "pb_err_p_fe_bct_onp_tags_used_notfree_e"),
    (19, "pb_err_p_fe_ib_bct_rd_inv"),
    (20, "pb_err_p_fe_ob_buffer_overflow_e"),
    (21, "pb_err_p_fe_ob_buffer_underflow_e"),
    (22, "pb_err_p_fe_ib_buffer_overflow_e"),
    (23, "pb_err_p_fe_ib_buffer_underflow_e"),
    (24, "pb_err_p_fe_ib_d_ecc_ue"),
    (25, "pb_err_p_fe_ib_h_ecc_ue"),
    (26, "pb_err_p_fe_ob_d_ecc_ue"),
    (27, "pb_err_p_fe_ob_h_ecc_ue"),
    (28, "pb_err_p_fe_ocf_ecc_ue"),
    (32, "pb_err_p_fe_tx_pst_discard_e"),
    (33, "pb_err_p_inf_tx_npst_discard_e"),
    (34, "pb_err_p_fe_nbw_tlp_e"),
    (36, "pb_err_p_fe_pci_rcv_cpl_ca_e"),
    (37, "pb_err_p_fe_pci_rcv_cpl_crs_e"),
    (38, "pb_err_p_fe_pci_rcv_cpl_rsvd_e"),
    (39, "pb_err_p_fe_pci_rcv_cpl_ur_e"),
    (40, "pb_err_p_fe_pci_rcv_ecrc_e"),
    (41, "pb_err_p_fe_pci_rcv_malf_tlp_e"),
    (42, "pb_err_p_fe_pci_rcv_overflow_e"),
    (43, "pb_err_p_fe_pci_rcv_poisoned_tlp_e"),
    (44, "pb_err_p_fe_pci_rcv_unexp_cpl_e"),
    (45, "pb_err_p_fe_pci_rcv_unsup_req_e"),
    (46, "pb_err_p_fe_pci_sig_cpl_abort_e"),
    (47, "pb_err_p_fe_pci_sig_cpl_timeout_e"),
    (48, "pb_err_p_fe_pci_sig_poisoned_tlp_e"),
    (52, "pb_err_p_inf_out_trans_to_pst_e"),
    (53, "pb_err_p_inf_out_trans_to_npst_e"),
    (54, "pb_err_p_inf_out_trans_to_cpl_e"),
    (56, "pb_err_p_inf_ib_d_ecc_ce"),
    (57, "pb_err_p_inf_ib_h_ecc_ce"),
    (58, "pb_err_p_inf_ob_d_ecc_ce"),
    (59, "pb_err_p_inf_ob_h_ecc_ce"),
    (60, "pb_err_p_inf_ocf_ecc_ce"),
    (62, "PBL Bad Register Address Error"),
    (63, "PBL Register Parity Error"),
]
# Mnemonic REGB_ESR - Address Offset 0x1C00 - phbRegbErrorStatus
#
# (bit number, description) pairs used by PHBError.show_errs() to decode the
# register stored under the name "phbRegbErrorStatus".
# NOTE(review): the description for bit 36 contains a stray space
# ("DL_EC08_ REPLAYROLLOVER"); it is printed verbatim.
regb_esr_bits = [
    (0, "REGB Internal Register Parity Error"),
    (1, "PBL Internal Register Parity Error"),
    (2, "Invalid Address Decode Error"),
    (3, "Register Access Invalid Address+Size Error"),
    (5, "Register State Machine or Other Internal Error"),
    (6, "PCI CFG Core Registers Parity Error"),
    (7, "Register access to CFG core while in reset error."),
    (8, "PCIE Link Down"),
    (9, "PCIE Link Up"),
    (10, "PCIE Link Auto Bandwidth Event Status"),
    (11, "PCIE Link BW Management Event Status"),
    (25, "PBL Error Trap: INF Error"),
    (26, "PBL Error Trap: ERC Error"),
    (27, "PBL Error Trap: FAT Error"),
    (28, "tldlpo_dl_mon_rxreceivererror(0)"),
    (29, "tldlpo_dl_mon_rxreceivererror(1)"),
    (30, "tldlpo_dl_mon_rxreceivererror(2)"),
    (32, "DL_EC08_BADDLLP"),
    (33, "DL_EC08_BADTLP"),
    (34, "DL_EC08_DLLPE"),
    (35, "DL_EC08_RECEIVERERROR"),
    (36, "DL_EC08_ REPLAYROLLOVER"),
    (37, "DL_EC08_REPLAYTIMEOUT"),
    (39, "DL_INTERNALERROR"),
    (40, "DL_LB_ERROR"),
    (41, "DL_RX_MALFORMED"),
    (42, "DL_RX_NULLIFY"),
    (43, "DL_RX_OVERFLOW"),
    (44, "DL_TX_CORRERROR"),
    (45, "DL_TX_UNCORRERROR"),
    (46, "TL_EC08_FCPE"),
    (48, "Replay ECC Correctable Error (CE)"),
    (49, "Replay ECC UnCorrectable Error (UE)"),
    (50, "Bad DLLP Error Count Saturated"),
    (51, "Bad TLP Error Count Saturated"),
    (52, "Receiver Error Count Saturated"),
    (53, "DLLPE Error Count Saturated"),
    (58, "pbl_ptl_dl_al_rx_initcredit_p_e"),
    (59, "pbl_ptl_dl_al_rx_updatecredit_p_e"),
    (60, "PTL Core DLIF Protocol Error"),
    (61, "PTL Core TLIF Protocol Error"),
    (62, "PTL Core Internal Parity Error"),
]
# (bit number, short name) pairs used by PHBError.show_errs() to decode the
# register stored under the name "NEST FIR". Only the short name is printed;
# the trailing comment on each entry holds the longer description.
# FIXME: use the long desc
nfir_bits = [
    (0, "bar_pe"), # One of the BARs or BAR Mask Register parity error.
    (1, "nonbar_pe"), # Any non-BAR parity error.
    (2, "PB_to_PEC_ce"), # ECC correctable error off of outbound SMP interconnect.
    (3, "PB_to_PEC_ue"), # ECC uncorrectable error off of outbound SMP interconnect.
    (4, "PB_to_PEC_sue"), # ECC special uncorrectable error off of outbound SMP interconnect
    (5, "ary_ecc_ce"), # ECC correctable error on an internal array.
    (6, "ary_ecc_ue"), # ECC uncorrectable error on an internal array.
    (7, "ary_ecc_sue"), # ECC special uncorrectable error on an internal array.
    (8, "register_array_pe"), # Parity error on an internal register file.
    (9, "pb_interface_pe"), # Parity error on the PB interface (address/aTag/tTag/rTAG).
    (10, "pb_data_hang_errors"), # Any SMP interconnect data hang poll error (only checked for CI stores).
    (11, "pb_hang_errors"), # Any SMP interconnect command hang error (domestic address range).
    (12, "rd_are_errors"), # SMP interconnect address error (ARE) detected by a DMA read.
    (13, "nonrd_are_errors"), # SMP interconnect address error detected by a DMA write or an interrupt engine.
    (14, "pci_hang_error"), # PBCQ detected that the PCI load, store, EOI, or DMA read response did not make forward progress.
    (15, "pci_clock_error"), # PBCQ has detected that the PCI clock has stopped.
    (16, "PFIR_freeze"), # This is the freeze signal from the PFIR freeze output.
    (17, "hw_errors"), # Any miscellaneous hardware error.
    (18, "UnsolicitiedPBData"), # The PEC received data with an rTAG matching a queue that was not expecting data or too much data was received.
    (19, "UnExpectedCResp"), # PEC received an unexpected combined response.
    (20, "InvalidCResp"), # PEC received an invalid combined response.
    (21, "PBUnsupportedSize"), # PEC received a CI load/store that hits a BAR but is an unsupported size or address alignment.
]
# (bit number, short name) pairs used by PHBError.show_errs() to decode the
# register stored under the name "PCI FIR". Only the short name is printed;
# the trailing comment on each entry holds the longer description.
pfir_bits = [
    (0, "register_pe"), # PBAIB register parity error.
    (1, "hardware_error"), # Hardware error.
    (2, "AIB_intf_error"), # AIB interface error.
    (3, "ETU_Reset_error"), # ETU reset error.
    (4, "PEC_scom_error"), # Common PEC SCOM error.
    (5, "scomfir_error0"), # SCOM Error bit 0
    (6, "scomfir_error1"), # SCOM Error bit 1
]
class PHBError:
    """One PHB register dump gathered from a firmware or kernel log.

    Attributes:
        timestamp: whatever the creator passed in; the parsers in this file
            pass the log line that opened the dump. Returned by header().
        regs: register name (spaces removed) -> integer value.
        pest: list of (pe_number, pesta, pestb) tuples in insertion order.
    """

    # Register name -> table of (bit number, description) pairs decoded by
    # show_errs(). Names go through get_reg(), so embedded spaces are ignored
    # when they are looked up.
    reg_bits = {
        "NEST FIR": nfir_bits,
        "PCI FIR": pfir_bits,
        "phbErrorStatus": phb_esr_bits,
        "phbTxeErrorStatus": txe_esr_bits,
        "phbRxeArbErrorStatus": rxe_arb_bits,
        "phbRxeMrgErrorStatus": rxe_mrg_bits,
        "phbRxeTceErrorStatus": rxe_tce_bits,
        "phbRegbErrorStatus": regb_esr_bits,
        "phbPblErrorStatus": pbl_esr_bits,
    }

    def __init__(self, timestamp = 0):
        self.timestamp = timestamp
        self.pest = []
        self.regs = {}

    def __str__(self):
        """Return one line per stored register: name, hex value, ppc.setbits(value)."""
        return "".join(
            "{:30s} - {:#018x} - {}\n".format(name, value, ppc.setbits(value))
            for name, value in self.regs.items()
        )

    def set_reg(self, reg, value):
        """Record *value* for register *reg* unless it was already recorded.

        Spaces are stripped from the name before it is used as a key.

        Returns True when the value was stored, False when the register is
        already present in this dump (the stored value is left untouched).
        Callers use a False result as the sign that a new dump has started.
        """
        reg = reg.replace(" ", "")
        # Test for the key itself rather than the truthiness of the stored
        # value: a register that legitimately reads as 0 is still "present"
        # and a second occurrence of it must be reported as a duplicate.
        if reg in self.regs:
            return False
        self.regs[reg] = value
        return True

    def get_reg(self, reg):
        """Return the stored value of *reg* (spaces ignored), or 0 if unknown."""
        return self.regs.get(reg.replace(" ", ""), 0)

    # NB: pest entries should be inserted in sort order, but it might be a good
    # idea to explicitly sort them by PE number
    def set_pest(self, pe, pesta, pestb):
        """Append a PEST entry for PE number *pe*."""
        self.pest.append((pe, pesta, pestb))

    def get_pest(self, pe_number):
        """Return (pesta, pestb) of the first entry for *pe_number*, or None."""
        for pe, pesta, pestb in self.pest:
            if pe == pe_number:
                return (pesta, pestb)
        return None

    def header(self):
        """Return the value this dump was created with (see ``timestamp``)."""
        return self.timestamp

    def _first_error_value(self, reg_name):
        """Return the value of the "first error" companion of *reg_name*, or 0.

        The companion name is built by inserting "First" in front of the
        first "Error" in *reg_name*; names without "Error" have no companion.
        """
        parts = reg_name.split("Error")
        if len(parts) <= 1:
            return 0
        first_value = self.get_reg("{:s}FirstError{:s}".format(parts[0], parts[1]))
        if first_value == 0:
            # skiboot spells it wrong, so check Frst too
            first_value = self.get_reg("{:s}FrstError{:s}".format(parts[0], parts[1]))
        return first_value

    # TODO: move the formatting out of here and into the main loop
    def show_errs(self):
        """Return a text report of every set bit in the decodable registers.

        For each register in reg_bits with a non-zero value the report holds
        a "<name> = <value>:" line, one line per set bit that has an entry in
        the bit table, and a blank line. A bit that is also set in the
        register's first-error companion is prefixed with "!". Recorded PEST
        entries, if any, are listed at the end.
        """
        chunks = []
        for reg_name, bit_table in self.reg_bits.items():
            reg_value = self.get_reg(reg_name)
            if reg_value == 0:
                continue
            first_value = self._first_error_value(reg_name)

            chunks.append("{} = {:016x}:\n".format(reg_name, reg_value))
            for bit, description in bit_table:
                mask = ppc.ppcbit(bit)
                if not mask & reg_value:
                    continue
                is_first = (mask & reg_value & first_value) == mask
                chunks.append("{:s}\t{:2d} - {}\n".format(
                    "!" if is_first else "", bit, description))
            chunks.append("\n")

        if self.pest:
            chunks.append("PEST entries:\n")
            chunks.extend(
                "\tPEST[{:03x}] = {:016x} {:016x}\n".format(pe, pesta, pestb)
                for pe, pesta, pestb in self.pest
            )
        return "".join(chunks)
def parse_opal_log(log_text):
    """Extract PHB register dumps printed by skiboot (OPAL) from *log_text*.

    Lines belonging to different PHBs may be interleaved, so one in-progress
    PHBError is kept per PHB name. A dump for a PHB is closed and a new one
    started when either a "PHB Freeze/Fence detected !" line is seen for that
    PHB, or a register is reported that the in-progress dump already holds.

    Returns a list of PHBError objects: closed dumps in the order they were
    closed, followed by the dumps still in progress at the end of the text.
    Lines whose value cannot be parsed as a single hex number, and PEST lines
    that do not have the expected shape, are ignored.
    """
    # Patterns to match:
    #
    # [ 938.249526636,3] PHB#0030[8:0]: NEST FIR WOF=0000800000000000
    # [ 938.250657886,3] PHB#0030[8:0]: slotStatus = 00402000
    # [ 938.254305278,3] PHB#0030[8:0]: PEST[511] = 3740002a01000000 0000000000000000
    #
    phblog_re = re.compile(
        r"^\[\s*[\d.,]+] "     # skiboot log header
        r"(PHB#....\[.:.]):"   # PHB name
        r"\s+"                 # whitespace between the PHB and register name
        r"([^:=]+)"            # register name, NB: this might have some trailing WS
        r"=\s*"                # the '=' separating name and value, along with the whitespace
        r"([a-fA-F\d ]+)")     # register value(s)

    # this alone isn't really sufficient. There's a few cases that can cause a register
    # dump to be generated (e.g. when the link is retrained we do a reg dump)
    new_log_marker = re.compile(
        r"^\[\s*[\d.,]+] "     # same header as above; padding after '[' is optional
        r"(PHB#....\[.:.]): "
        r"PHB Freeze/Fence detected !")

    pest_re = re.compile(r"PEST\[([\da-fA-F]+)] = ([\da-fA-F]+) ([\da-fA-F]+)")

    # Store the current register set for each PHB. Keep in mind that we can have register
    # dumps from different PHBs being interleaved in the register log.
    current = {}

    # list discovered error logs
    error_logs = []

    for line in log_text.split("\n"):
        marker = new_log_marker.match(line)
        if marker:
            # new log marker, save the current log (if any) and create a new
            # one to store register values in
            phb = marker.group(1)
            if phb in current:
                error_logs.append(current[phb])
            current[phb] = PHBError(line)
            continue

        m = phblog_re.match(line)
        if not m:
            continue
        phb, raw_name, raw_value = m.groups()

        if "PEST" in raw_name:  # PEST entry
            # NB: unlike .match() .search() scans the whole string
            pest = pest_re.search(line)
            if not pest:
                continue
            pe, pesta, pestb = (int(field, 16) for field in pest.groups())
            if phb not in current:
                current[phb] = PHBError(line)
            current[phb].set_pest(pe, pesta, pestb)
            continue

        # Normal register. The value pattern tolerates embedded spaces, so
        # make sure it really is one hex number before touching any state.
        try:
            value = int(raw_value.strip(), 16)
        except ValueError:
            continue
        name = raw_name.strip()

        if phb not in current:
            current[phb] = PHBError(line)
        if not current[phb].set_reg(name, value):
            # If we have duplicate registers then we're in a new log context
            # so stash the current one and init a new one.
            error_logs.append(current[phb])
            current[phb] = PHBError(line)
            current[phb].set_reg(name, value)

    # save all the logs we're still processing
    error_logs.extend(current.values())
    return error_logs
'''
Mar 25 10:01:49 localhost kernel: PHB4 PHB#48 Diag-data (Version: 1)
Mar 25 10:01:49 localhost kernel: brdgCtl: 00000002
Mar 25 10:01:49 localhost kernel: RootSts: 00010020 00402000 a1030008 00100107 00002000
Mar 25 10:01:49 localhost kernel: RootErrSts: 00000000 00000000 00000001
Mar 25 10:01:49 localhost kernel: PhbSts: 0000001c00000000 0000001c00000000
Mar 25 10:01:49 localhost kernel: Lem: 0000000100280000 0000000000000000 0000000100000000
Mar 25 10:01:49 localhost kernel: PhbErr: 0000088000000000 0000008000000000 2148000098000240 a008400000000000
Mar 25 10:01:49 localhost kernel: RxeArbErr: 4000200000000000 0000200000000000 02409fde30000000 0000000000000000
Mar 25 10:01:49 localhost kernel: PblErr: 0000000001000000 0000000001000000 0000000000000000 0000000000000000
Mar 25 10:01:49 localhost kernel: PcieDlp: 0000000000000000 0000000000000000 ffff000000000000
Mar 25 10:01:49 localhost kernel: RegbErr: 0000004a10000800 0000000810000000 8800003c00000000 0000000007011000
Mar 25 10:01:49 localhost kernel: PE[1fd] A/B: a440002a05000000 8000000000000000
'''
def parse_kernel_log(log_text):
    """Extract PHB4 diag-data dumps printed by the Linux kernel from *log_text*.

    A dump starts at a "PHB4 PHB#<n> Diag-data" header line. Following lines
    of the form "<label>: <hex> [<hex> ...]" and "PE[xxx] A/B: <hex> <hex>"
    are folded into the current PHBError until the next header.

    Returns a list of PHBError objects in log order. Register or PE lines
    seen before any header are collected into a leading PHBError whose header
    is the empty string; that leading object is dropped when it stays empty,
    so text without any kernel dump yields an empty list.
    """
    reg8 = "([0-9a-fA-F]{8})"
    reg16 = "([0-9a-fA-F]{16})"

    # TODO: pick up the AER stuff the kernel logs too?

    # NB: The register names used for set_reg are the skiboot register names, not the kernel.
    # TODO: check these for completeness / accuracy. I might have missed something
    #
    # (kernel label, pattern for one value, names for the values in order)
    register_layout = [
        ("brdgCtl", reg8, ("brdgCtl",)),
        ("RootSts", reg8,
            ("deviceStatus", "slotStatus", "linkStatus", "devCmdStatus", "devSecStatus")),
        ("RootErrSts", reg8,
            ("rootErrorStatus", "uncorrErrorStatus", "corrErrorStatus")),
        ("PhbSts", reg16, ("phbPlssr", "phbCsr")),
        ("nFir", reg16, ("nFir", "nFirMask", "nFirWOF")),
        ("Lem", reg16, ("lemFir", "lemErrorMask", "lemWOF")),
        ("PhbErr", reg16,
            ("phbErrorStatus", "phbFirstErrorStatus", "phbErrorLog0", "phbErrorLog1")),
        # These must be spelled "phbTxe..." so that PHBError.show_errs(),
        # which looks up "phbTxeErrorStatus", can find and decode them.
        ("PhbTxeErr", reg16,
            ("phbTxeErrorStatus", "phbTxeFirstErrorStatus", "phbTxeErrorLog0", "phbTxeErrorLog1")),
        ("RxeArbErr", reg16,
            ("phbRxeArbErrorStatus", "phbRxeArbFirstErrorStatus", "phbRxeArbErrorLog0", "phbRxeArbErrorLog1")),
        ("RxeMrgErr", reg16,
            ("phbRxeMrgErrorStatus", "phbRxeMrgFirstErrorStatus", "phbRxeMrgErrorLog0", "phbRxeMrgErrorLog1")),
        ("RxeTceErr", reg16,
            ("phbRxeTceErrorStatus", "phbRxeTceFirstErrorStatus", "phbRxeTceErrorLog0", "phbRxeTceErrorLog1")),
        ("PblErr", reg16,
            ("phbPblErrorStatus", "phbPblFirstErrorStatus", "phbPblErrorLog0", "phbPblErrorLog1")),
        ("PcieDlp", reg16,
            ("phbPcieDlpErrorLog1", "phbPcieDlpErrorLog2", "phbPcieDlpErrorStatus")),
        ("RegbErr", reg16,
            ("phbRegbErrorStatus", "phbRegbFirstErrorStatus", "phbRegbErrorLog0", "phbRegbErrorLog1")),
    ]
    # "\s+" after the label accepts a single space as well as column-aligned
    # padding between the label and the first value.
    register_patterns = [
        (re.compile(label + r":\s+" + " ".join([field] * len(names))), names)
        for label, field, names in register_layout
    ]

    header_pattern = re.compile("PHB4 PHB#[0-9]+ Diag-data") # match header
    # The PE number is up to three hex digits, optionally left-padded with
    # spaces; only the digits are captured so int() always accepts them.
    pe_pattern = re.compile(
        r"PE\[\s*([0-9a-fA-F]{1,3})\] A/B:\s+" + reg16 + " " + reg16)

    def has_content(candidate):
        # A dump is worth reporting if it has a header line or any data.
        return bool(candidate.timestamp or candidate.regs or candidate.pest)

    logs = []
    log = PHBError("")

    # pretty nasty but since interpreting the kernel logs requires context I
    # don't have any better ideas
    for line in log_text.split("\n"):
        if header_pattern.search(line): # start a new log
            if has_content(log):
                logs.append(log)
            log = PHBError(line)
            continue

        for pattern, names in register_patterns:
            m = pattern.search(line)
            if not m:
                continue
            for name, val in zip(names, m.groups()):
                log.set_reg(name, int(val, 16))
            break

        m = pe_pattern.search(line)
        if m:
            pe, pesta, pestb = (int(field, 16) for field in m.groups())
            log.set_pest(pe, pesta, pestb)

    if has_content(log):
        logs.append(log)
    return logs
def main(argv):
    """Decode every PHB register dump found in the log file named by argv[1].

    The file is scanned for both skiboot and kernel style dumps and a decoded
    report is printed for each. Prints a usage line and returns when no file
    name is given; exits with status 1 when the file cannot be read.
    """
    if len(argv) < 2:
        print("Usage: {} <log file>".format(argv[0]))
        return

    try:
        # The context manager closes the file even if read() fails.
        with open(argv[1]) as log_file:
            log_text = log_file.read()
    except (OSError, UnicodeError) as err:
        print(err, file=sys.stderr)
        sys.exit(1)

    logs = parse_opal_log(log_text)
    logs.extend(parse_kernel_log(log_text))

    for err in logs:
        print("==== PHB Register dump found ====")
        print("")
        print(err.header())
        print("")
        print(err.show_errs())


if __name__ == "__main__":
    main(sys.argv)