| // SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later |
| /* |
| * Centaur memory buffer chip |
| * |
| * Copyright 2013-2017 IBM Corp. |
| */ |
| |
| #include <skiboot.h> |
| #include <xscom.h> |
| #include <processor.h> |
| #include <device.h> |
| #include <chip.h> |
| #include <centaur.h> |
| #include <lock.h> |
| #include <fsi-master.h> |
| #include <timebase.h> |
| |
| /* |
| * Centaur chip IDs are using the XSCOM "partID" encoding |
| * described in xscom.h. recap: |
| * |
| * 0b1000.0000.0000.0000.0000.00NN.NCCC.MMMM |
| * N=Node, C=Chip, M=Memory Channel |
| * |
| * We currently use FSI exclusively for centaur access. We can |
| * start using MMIO on Centaur DD2.x when we have a way to handle |
| * machine checks happening inside Sapphire which we don't at the |
| * moment. |
| */ |
| |
/*
 * Number of centaur slots allocated per processor chip; memory channel
 * numbers must be below this value.
 *
 * Is that correct ?
 */
#define MAX_CENTAURS_PER_CHIP		8

/*
 * A centaur is marked offline once its count of consecutive failed
 * accesses exceeds this value.
 */
#define CENTAUR_ERR_OFFLINE_THRESHOLD	10

/*
 * FSI2PIB register definitions (these could be moved out if we were to
 * support FSI master to other chips).
 */
#define FSI_DATA0_REG		0x1000	/* Upper 32 bits of the SCOM data */
#define FSI_DATA1_REG		0x1004	/* Lower 32 bits of the SCOM data */
#define FSI_CMD_REG		0x1008	/* PCB address OR'ed with a FSI_CMD_* flag */
#define   FSI_CMD_WR		0x80000000
#define   FSI_CMD_RD		0x00000000
#define FSI_ENG_RESET_REG	0x1018	/* Written to reset the SCOM engine */
#define FSI_STATUS_REG		0x101c	/* Checked after each command */
#define   FSI_STATUS_ABORT	0x00100000
#define   FSI_STATUS_ERRORS	0x00007000

/* Some Centaur XSCOMs we care about */
#define SCAC_CONFIG_REG		0x020115ce
#define SCAC_CONFIG_SET		0x020115cf
#define SCAC_CONFIG_CLR		0x020115d0
#define   SCAC_ENABLE_MSK	PPC_BIT(0)

/*
 * Log a message prefixed with the part ID of the centaur it concerns.
 *
 * __c is any expression evaluating to a pointer to an object with a
 * part_id member; it is parenthesized so that expressions such as
 * "base + n" expand correctly.
 */
#define cent_log(__lev, __c, __fmt, ...) \
	prlog(__lev, "CENTAUR %x: " __fmt, (__c)->part_id, ##__VA_ARGS__)
| |
| static int64_t centaur_fsiscom_complete(struct centaur_chip *centaur) |
| { |
| int64_t rc; |
| uint32_t stat; |
| |
| rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine, |
| centaur->fsi_master_port, FSI_STATUS_REG, &stat); |
| if (rc) { |
| cent_log(PR_ERR, centaur, "MFSI read error %lld reading STAT\n", rc); |
| return rc; |
| } |
| if ((stat & (FSI_STATUS_ABORT | FSI_STATUS_ERRORS)) == 0) |
| return OPAL_SUCCESS; |
| |
| cent_log(PR_ERR, centaur, "Remote FSI SCOM error, status=0x%08x\n", stat); |
| |
| /* All 1's ? Assume it's gone */ |
| if (stat == 0xffffffffu) { |
| cent_log(PR_ERR, centaur, "Chip appears to be dead !\n"); |
| centaur->valid = false; |
| |
| /* Here, hostboot grabs a pile of FFDC from the FSI layer, |
| * we could do that too ... |
| */ |
| return OPAL_HARDWARE; |
| } |
| |
| /* Here HB prints the GPx registers which I believe are only |
| * in the host (FSI master). We skip that for now, we don't have |
| * a good API to them |
| */ |
| |
| /* Recovery sequence from HostBoot fsiscom.C |
| * if SCOM fails and FSI Master displays "MasterTimeOut" |
| * then 7,6 <covered by FSI driver> |
| * else if SCOM fails and FSI2PIB Status shows PIB abort |
| * then just perform unit reset (6) and wait 1 ms |
| * else (PIB_abort='0' but PIB error is unequal 0) |
| * then just perform unit reset (6) (wait not needed). |
| * |
| * Note: Waiting 1ms inside OPAL is a BIG NO NO !!! We have |
| * no choice but doing it at the moment but that will have |
| * to be fixed one way or another, possibly by returning some |
| * kind of busy status until the delay is expired. |
| */ |
| rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, |
| centaur->fsi_master_port, FSI_ENG_RESET_REG, 0); |
| if (rc) { |
| cent_log(PR_ERR, centaur, "MFSI write error %lld resetting SCOM engine\n", |
| rc); |
| } |
| return OPAL_HARDWARE; |
| } |
| |
| static int64_t centaur_fsiscom_read(struct centaur_chip *centaur, uint32_t pcb_addr, |
| uint64_t *val) |
| { |
| int64_t rc; |
| uint32_t data0, data1; |
| |
| rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, |
| centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_RD); |
| if (rc) { |
| cent_log(PR_ERR, centaur, "MFSI write error %lld writing CMD\n", rc); |
| return rc; |
| } |
| |
| rc = centaur_fsiscom_complete(centaur); |
| if (rc) |
| return rc; |
| |
| rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine, |
| centaur->fsi_master_port, FSI_DATA0_REG, &data0); |
| if (rc) { |
| cent_log(PR_ERR, centaur, "MFSI read error %lld reading DATA0\n", rc); |
| return rc; |
| } |
| rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine, |
| centaur->fsi_master_port, FSI_DATA1_REG, &data1); |
| if (rc) { |
| cent_log(PR_ERR, centaur, "MFSI read error %lld readking DATA1\n", rc); |
| return rc; |
| } |
| |
| *val = (((uint64_t)data0) << 32) | data1; |
| |
| return OPAL_SUCCESS; |
| } |
| |
| static int64_t centaur_fsiscom_write(struct centaur_chip *centaur, uint32_t pcb_addr, |
| uint64_t val) |
| { |
| int64_t rc; |
| |
| rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, |
| centaur->fsi_master_port, FSI_DATA0_REG, hi32(val)); |
| if (rc) { |
| cent_log(PR_ERR, centaur, "MFSI write error %lld writing DATA0\n", rc); |
| return rc; |
| } |
| rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, |
| centaur->fsi_master_port, FSI_DATA1_REG, lo32(val)); |
| if (rc) { |
| cent_log(PR_ERR, centaur, "MFSI write error %lld writing DATA1\n", rc); |
| return rc; |
| } |
| rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, |
| centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_WR); |
| if (rc) { |
| cent_log(PR_ERR, centaur, "MFSI write error %lld writing CMD\n", rc); |
| return rc; |
| } |
| |
| return centaur_fsiscom_complete(centaur); |
| } |
| |
| struct centaur_chip *get_centaur(uint32_t part_id) |
| { |
| uint32_t hchip_id, mchan; |
| struct proc_chip *hchip; |
| struct centaur_chip *centaur; |
| |
| if ((part_id >> 28) != 8) { |
| prerror("CENTAUR: Invalid part ID 0x%x\n", part_id); |
| return NULL; |
| } |
| hchip_id = (part_id & 0x0fffffff) >> 4; |
| mchan = part_id & 0xf; |
| |
| hchip = get_chip(hchip_id); |
| if (!hchip) { |
| prerror("CENTAUR: Centaur 0x%x not found on non-existing chip 0%x\n", |
| part_id, hchip_id); |
| return NULL; |
| } |
| if (mchan >= MAX_CENTAURS_PER_CHIP) { |
| prerror("CENTAUR: Centaur 0x%x channel out of bounds !\n", part_id); |
| return NULL; |
| } |
| if (!hchip->centaurs) { |
| prerror("CENTAUR: Centaur 0x%x not found on chip 0%x (no centaurs)\n", |
| part_id, hchip_id); |
| return NULL; |
| } |
| centaur = &hchip->centaurs[mchan]; |
| if (!centaur->valid) { |
| prerror("CENTAUR: Centaur 0x%x not valid on chip 0%x\n", |
| part_id, hchip_id); |
| return NULL; |
| } |
| return centaur; |
| } |
| |
| /* |
| * Indirect XSCOM access functions. Copied from xscom.c; at a |
| * later date, we should merge these properly. |
| */ |
| static void centaur_xscom_handle_ind_error(struct centaur_chip *centaur, |
| uint64_t data, uint64_t pcb_addr, |
| bool is_write) |
| { |
| unsigned int stat = GETFIELD(XSCOM_DATA_IND_ERR, data); |
| bool timeout = !(data & XSCOM_DATA_IND_COMPLETE); |
| |
| /* XXX: Create error log entry ? */ |
| if (timeout) |
| cent_log(PR_ERR, centaur, |
| "inddirect %s timeout, pcb_addr=0x%llx stat=0x%x\n", |
| is_write ? "write" : "read", pcb_addr, stat); |
| else |
| cent_log(PR_ERR, centaur, |
| "indirect %s error, pcb_addr=0x%llx stat=0x%x\n", |
| is_write ? "write" : "read", pcb_addr, stat); |
| } |
| |
| static int centaur_xscom_ind_read(struct centaur_chip *centaur, |
| uint64_t pcb_addr, uint64_t *val) |
| { |
| uint32_t addr; |
| uint64_t data; |
| int rc, retries; |
| |
| /* Write indirect address */ |
| addr = pcb_addr & 0x7fffffff; |
| data = XSCOM_DATA_IND_READ | |
| (pcb_addr & XSCOM_ADDR_IND_ADDR); |
| rc = centaur_fsiscom_write(centaur, addr, data); |
| if (rc) |
| goto bail; |
| |
| /* Wait for completion */ |
| for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) { |
| rc = centaur_fsiscom_read(centaur, addr, &data); |
| if (rc) |
| goto bail; |
| if ((data & XSCOM_DATA_IND_COMPLETE) && |
| ((data & XSCOM_DATA_IND_ERR) == 0)) { |
| *val = data & XSCOM_DATA_IND_DATA; |
| break; |
| } |
| if ((data & XSCOM_DATA_IND_COMPLETE) || |
| (retries >= XSCOM_IND_MAX_RETRIES)) { |
| centaur_xscom_handle_ind_error(centaur, data, pcb_addr, |
| false); |
| rc = OPAL_HARDWARE; |
| goto bail; |
| } |
| } |
| bail: |
| if (rc) |
| *val = (uint64_t)-1; |
| return rc; |
| } |
| |
| static int centaur_xscom_ind_write(struct centaur_chip *centaur, |
| uint64_t pcb_addr, uint64_t val) |
| { |
| uint32_t addr; |
| uint64_t data; |
| int rc, retries; |
| |
| /* Write indirect address & data */ |
| addr = pcb_addr & 0x7fffffff; |
| data = pcb_addr & XSCOM_ADDR_IND_ADDR; |
| data |= val & XSCOM_ADDR_IND_DATA; |
| |
| rc = centaur_fsiscom_write(centaur, addr, data); |
| if (rc) |
| goto bail; |
| |
| /* Wait for completion */ |
| for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) { |
| rc = centaur_fsiscom_read(centaur, addr, &data); |
| if (rc) |
| goto bail; |
| if ((data & XSCOM_DATA_IND_COMPLETE) && |
| ((data & XSCOM_DATA_IND_ERR) == 0)) |
| break; |
| if ((data & XSCOM_DATA_IND_COMPLETE) || |
| (retries >= XSCOM_IND_MAX_RETRIES)) { |
| centaur_xscom_handle_ind_error(centaur, data, pcb_addr, |
| true); |
| rc = OPAL_HARDWARE; |
| goto bail; |
| } |
| } |
| bail: |
| return rc; |
| } |
| |
| static int64_t centaur_xscom_read(struct scom_controller *scom, |
| uint32_t id __unused, uint64_t pcb_addr, |
| uint64_t *val) |
| { |
| struct centaur_chip *centaur = scom->private; |
| int64_t rc; |
| |
| if (!centaur) |
| return OPAL_PARAMETER; |
| if (!centaur->online) |
| return OPAL_XSCOM_CTR_OFFLINED; |
| |
| lock(¢aur->lock); |
| if (pcb_addr & XSCOM_ADDR_IND_FLAG) |
| rc = centaur_xscom_ind_read(centaur, pcb_addr, val); |
| else |
| rc = centaur_fsiscom_read(centaur, pcb_addr, val); |
| |
| /* We mark the centaur offline if we get too many errors on |
| * consecutive accesses |
| */ |
| if (rc) { |
| centaur->error_count++; |
| if (centaur->error_count > CENTAUR_ERR_OFFLINE_THRESHOLD) { |
| centaur->online = false; |
| /** |
| * @fwts-label CentaurOfflinedTooManyErrors |
| * @fwts-advice OPAL marked a Centaur (memory buffer) |
| * as offline due to CENTAUR_ERR_OFFLINE_THRESHOLD (10) |
| * consecutive errors on XSCOMs to this centaur. |
| * OPAL will now return OPAL_XSCOM_CTR_OFFLINED and not |
| * try any further XSCOMs. This is likely caused by |
| * some hardware issue or PRD recovery issue. |
| */ |
| prlog(PR_ERR, "CENTAUR: Offlined %x due to > %d consecutive XSCOM errors. No more XSCOMs to this centaur.\n", |
| id, CENTAUR_ERR_OFFLINE_THRESHOLD); |
| } |
| } else |
| centaur->error_count = 0; |
| unlock(¢aur->lock); |
| |
| return rc; |
| } |
| |
| static int64_t centaur_xscom_write(struct scom_controller *scom, |
| uint32_t id __unused, uint64_t pcb_addr, |
| uint64_t val) |
| { |
| struct centaur_chip *centaur = scom->private; |
| int64_t rc; |
| |
| if (!centaur) |
| return OPAL_PARAMETER; |
| if (!centaur->online) |
| return OPAL_XSCOM_CTR_OFFLINED; |
| |
| lock(¢aur->lock); |
| if (pcb_addr & XSCOM_ADDR_IND_FLAG) |
| rc = centaur_xscom_ind_write(centaur, pcb_addr, val); |
| else |
| rc = centaur_fsiscom_write(centaur, pcb_addr, val); |
| |
| /* We mark the centaur offline if we get too many errors on |
| * consecutive accesses |
| */ |
| if (rc) { |
| centaur->error_count++; |
| if (centaur->error_count > CENTAUR_ERR_OFFLINE_THRESHOLD) |
| centaur->online = false; |
| } else |
| centaur->error_count = 0; |
| unlock(¢aur->lock); |
| |
| return rc; |
| } |
| |
| static bool centaur_check_id(struct centaur_chip *centaur) |
| { |
| int64_t rc; |
| uint64_t val; |
| |
| rc = centaur_fsiscom_read(centaur, 0xf000f, &val); |
| if (rc) { |
| cent_log(PR_ERR, centaur, |
| " FSISCOM error %lld reading ID register\n", |
| rc); |
| return false; |
| } |
| |
| /* Extract CFAM id */ |
| val >>= 44; |
| |
| /* Identify chip */ |
| if ((val & 0xff) != 0xe9) { |
| cent_log(PR_ERR, centaur, |
| " CFAM ID 0x%02x is not a Centaur !\n", |
| (unsigned int)(val & 0xff)); |
| return false; |
| } |
| |
| /* Get EC level from CFAM ID */ |
| centaur->ec_level = ((val >> 16) & 0xf) << 4; |
| centaur->ec_level |= (val >> 8) & 0xf; |
| |
| return true; |
| } |
| |
| static bool centaur_add(uint32_t part_id, uint32_t mchip, uint32_t meng, |
| uint32_t mport) |
| { |
| uint32_t hchip_id, mchan; |
| struct proc_chip *hchip; |
| struct centaur_chip *centaur; |
| |
| if ((part_id >> 28) != 8) { |
| prerror("CENTAUR: Invalid part ID 0x%x\n", part_id); |
| return false; |
| } |
| hchip_id = (part_id & 0x0fffffff) >> 4; |
| mchan = part_id & 0xf; |
| |
| printf("CENTAUR: Found centaur for chip 0x%x channel %d\n", |
| hchip_id, mchan); |
| printf("CENTAUR: FSI host: 0x%x cMFSI%d port %d\n", |
| mchip, meng, mport); |
| |
| hchip = get_chip(hchip_id); |
| if (!hchip) { |
| prerror("CENTAUR: No such chip !!!\n"); |
| return false; |
| } |
| |
| if (mchan >= MAX_CENTAURS_PER_CHIP) { |
| prerror("CENTAUR: Channel out of bounds !\n"); |
| return false; |
| } |
| |
| if (!hchip->centaurs) { |
| hchip->centaurs = |
| zalloc(sizeof(struct centaur_chip) * |
| MAX_CENTAURS_PER_CHIP); |
| assert(hchip->centaurs); |
| } |
| |
| centaur = &hchip->centaurs[mchan]; |
| if (centaur->valid) { |
| prerror("CENTAUR: Duplicate centaur !\n"); |
| return false; |
| } |
| centaur->part_id = part_id; |
| centaur->fsi_master_chip_id = mchip; |
| centaur->fsi_master_port = mport; |
| centaur->fsi_master_engine = meng ? MFSI_cMFSI1 : MFSI_cMFSI0; |
| centaur->online = true; |
| init_lock(¢aur->lock); |
| list_head_init(¢aur->i2cms); |
| |
| if (!centaur_check_id(centaur)) |
| return false; |
| |
| centaur->scom.part_id = part_id; |
| centaur->scom.private = centaur; |
| centaur->scom.read = centaur_xscom_read; |
| centaur->scom.write = centaur_xscom_write; |
| scom_register(¢aur->scom); |
| |
| cent_log(PR_INFO, centaur, "Found DD%x.%x chip\n", |
| centaur->ec_level >> 4, |
| centaur->ec_level & 0xf); |
| |
| centaur->valid = true; |
| return true; |
| } |
| |
| /* Returns how long to wait for logic to stop in TB ticks or a negative |
| * value on error |
| */ |
| int64_t centaur_disable_sensor_cache(uint32_t part_id) |
| { |
| struct centaur_chip *centaur = get_centaur(part_id); |
| int64_t rc = 0; |
| uint64_t ctrl; |
| |
| if (!centaur) |
| return false; |
| |
| lock(¢aur->lock); |
| centaur->scache_disable_count++; |
| if (centaur->scache_disable_count == 1) { |
| centaur->scache_was_enabled = false; |
| rc = centaur_fsiscom_read(centaur, SCAC_CONFIG_REG, &ctrl); |
| if (rc) |
| goto bail; |
| centaur->scache_was_enabled = !!(ctrl & SCAC_ENABLE_MSK); |
| rc = centaur_fsiscom_write(centaur, SCAC_CONFIG_CLR, SCAC_ENABLE_MSK); |
| if (rc) |
| goto bail; |
| rc = msecs_to_tb(30); |
| } |
| bail: |
| unlock(¢aur->lock); |
| return rc; |
| } |
| |
| int64_t centaur_enable_sensor_cache(uint32_t part_id) |
| { |
| struct centaur_chip *centaur = get_centaur(part_id); |
| int64_t rc = 0; |
| |
| if (!centaur) |
| return false; |
| |
| lock(¢aur->lock); |
| if (centaur->scache_disable_count == 0) { |
| cent_log(PR_ERR, centaur, "Cache count going negative !\n"); |
| backtrace(); |
| goto bail; |
| } |
| centaur->scache_disable_count--; |
| if (centaur->scache_disable_count == 0 && centaur->scache_was_enabled) |
| rc = centaur_fsiscom_write(centaur, SCAC_CONFIG_SET, SCAC_ENABLE_MSK); |
| bail: |
| unlock(¢aur->lock); |
| return rc; |
| } |
| |
| void centaur_init(void) |
| { |
| struct dt_node *cn; |
| |
| dt_for_each_compatible(dt_root, cn, "ibm,centaur") { |
| uint32_t chip_id, mchip, meng, mport; |
| |
| chip_id = dt_prop_get_u32(cn, "ibm,chip-id"); |
| mchip = dt_prop_get_u32(cn, "ibm,fsi-master-chip-id"); |
| meng = dt_prop_get_cell(cn, "ibm,fsi-master-port", 0); |
| mport = dt_prop_get_cell(cn, "ibm,fsi-master-port", 1); |
| |
| /* |
| * If adding the centaur succeeds, we expose it to |
| * Linux as a scom-controller |
| */ |
| if (centaur_add(chip_id, mchip, meng, mport)) |
| dt_add_property(cn, "scom-controller", NULL, 0); |
| } |
| } |