blob: 03461373e247d7d67b76297a6d8ae488779ca650 [file] [log] [blame]
// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
/*
* Copyright 2019 IBM Corp.
*/
#include <io.h>
#include <xscom.h>
#include <npu3.h>
#include <npu3-regs.h>
#include <nvram.h>
#include <interrupts.h>
#include <xive.h>
#define NPU3LOG(l, npu, fmt, a...) \
prlog(l, "NPU[%d:%d]: " fmt, (npu)->chip_id, (npu)->index, ##a)
#define NPU3DBG(npu, fmt, a...) NPU3LOG(PR_DEBUG, npu, fmt, ##a)
#define NPU3INF(npu, fmt, a...) NPU3LOG(PR_INFO, npu, fmt, ##a)
#define NPU3ERR(npu, fmt, a...) NPU3LOG(PR_ERR, npu, fmt, ##a)
#define NPU3DEVLOG(l, dev, fmt, a...) \
prlog(l, "NPU[%d:%d:%d]: " fmt, \
(dev)->npu->chip_id, \
(dev)->npu->index, \
(dev)->index, ##a)
#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a)
#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a)
#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a)
static void npu3_dt_create_link(struct dt_node *npu, uint32_t npu_index,
uint32_t dev_index)
{
struct dt_node *link;
uint32_t phy_lane_mask, ob_chiplet;
link = dt_new_addr(npu, "link", dev_index);
dt_add_property_string(link, "compatible", "ibm,npu-link");
dt_add_property_cells(link, "reg", dev_index);
dt_add_property_cells(link, "ibm,npu-link-index", dev_index);
switch (npu_index) {
case 0:
/* fall through */
case 2:
ob_chiplet = npu_index ? 3 : 0;
switch (dev_index) {
case 0:
phy_lane_mask = PPC_BITMASK32(0, 3);
break;
case 1:
phy_lane_mask = PPC_BITMASK32(13, 16);
break;
case 2:
phy_lane_mask = PPC_BITMASK32(7, 10);
break;
case 3:
phy_lane_mask = PPC_BITMASK32(20, 23);
break;
}
break;
case 1:
switch (dev_index) {
case 0:
ob_chiplet = 1;
phy_lane_mask = PPC_BITMASK32(0, 3);
break;
case 1:
ob_chiplet = 2;
phy_lane_mask = PPC_BITMASK32(0, 3);
break;
case 2:
ob_chiplet = 1;
phy_lane_mask = PPC_BITMASK32(7, 10);
break;
case 3:
ob_chiplet = 2;
phy_lane_mask = PPC_BITMASK32(7, 10);
break;
}
break;
default:
return;
}
dt_add_property_cells(link, "ibm,npu-phy", ob_chiplet);
dt_add_property_cells(link, "ibm,npu-lane-mask", phy_lane_mask);
}
static void npu3_dt_create_npu(struct dt_node *xscom, uint32_t npu_index)
{
const uint32_t npu_base[] = { 0x5011000, 0x5011400, 0x3011c00 };
struct dt_node *npu;
npu = dt_new_addr(xscom, "npu", npu_base[npu_index]);
dt_add_property_cells(npu, "#size-cells", 0);
dt_add_property_cells(npu, "#address-cells", 1);
dt_add_property_cells(npu, "reg", npu_base[npu_index], 0x2c);
dt_add_property_string(npu, "compatible", "ibm,power9-npu3");
dt_add_property_cells(npu, "ibm,npu-index", npu_index);
for (uint32_t i = 0; i < NPU3_LINKS_PER_NPU; i++)
npu3_dt_create_link(npu, npu_index, i);
}
/* This can be removed when/if we decide to use HDAT instead */
static bool npu3_dt_create(void)
{
struct proc_chip *chip = next_chip(NULL);
struct dt_node *xscom;
/* npu3 chips only */
if (proc_gen < proc_gen_p9 ||
chip->type == PROC_CHIP_P9_NIMBUS ||
chip->type == PROC_CHIP_P9_CUMULUS)
return false;
dt_for_each_compatible(dt_root, xscom, "ibm,xscom")
for (uint32_t i = 0; i < 3; i++)
npu3_dt_create_npu(xscom, i);
return true;
}
static struct npu3 *npu3_create(struct dt_node *dn)
{
struct npu3 *npu;
struct dt_node *link;
struct npu3_dev *dev;
char *path;
uint32_t i;
npu = zalloc(sizeof(*npu));
assert(npu);
init_lock(&npu->lock);
npu->dt_node = dn;
npu->index = dt_prop_get_u32(dn, "ibm,npu-index");
npu->xscom_base = dt_get_address(dn, 0, NULL);
npu->chip_id = dt_get_chip_id(dn);
assert(get_chip(npu->chip_id));
dt_for_each_compatible(dn, link, "ibm,npu-link") {
i = dt_prop_get_u32(link, "ibm,npu-link-index");
assert(i < NPU3_LINKS_PER_NPU);
dev = &npu->devices[i];
dev->index = i;
dev->npu = npu;
dev->dn = link;
dev->ob_chiplet = dt_prop_get_u32(link, "ibm,npu-phy");
dev->phy_lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");
dev->proc.status = NPU3_PROC_COMPLETE;
};
path = dt_get_path(dn);
NPU3INF(npu, "Found %s\n", path);
NPU3INF(npu, "SCOM base: 0x%llx\n", npu->xscom_base);
free(path);
return npu;
}
struct npu3_dev *npu3_next_dev(struct npu3 *npu, struct npu3_dev *dev,
enum npu3_dev_type type)
{
uint32_t i = 0;
if (dev)
i = dev->index + 1;
for (; i < NPU3_LINKS_PER_NPU; i++) {
dev = &npu->devices[i];
if (dev->type == type || type == NPU3_DEV_TYPE_ANY)
return dev;
}
return NULL;
}
static void npu3_device_detect_fixup(struct npu3_dev *dev)
{
struct dt_node *dn = dev->dn;
if (dev->type == NPU3_DEV_TYPE_NVLINK) {
dt_add_property_strings(dn, "ibm,npu-link-type", "nvlink");
dev->link_speed = dt_prop_get_u32_def(
dn, "nvidia,link-speed", 0xff);
return;
}
NPU3DEVDBG(dev, "Link type unknown\n");
dt_add_property_strings(dn, "ibm,npu-link-type", "unknown");
}
/*
* We use the indirect method because it uses the same addresses as
* the MMIO offsets (NPU RING)
*/
static void npu3_scom_sel(struct npu3 *npu, uint64_t reg, uint64_t size)
{
uint64_t val;
val = SETFIELD(NPU3_MISC_DA_ADDR, 0ull, reg);
val = SETFIELD(NPU3_MISC_DA_LEN, val, size);
xscom_write(npu->chip_id,
npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_ADDR,
val);
}
static void npu3_scom_write(struct npu3 *npu, uint64_t reg, uint64_t size,
uint64_t val)
{
npu3_scom_sel(npu, reg, size);
xscom_write(npu->chip_id,
npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_DATA,
val);
}
static uint64_t npu3_scom_read(struct npu3 *npu, uint64_t reg, uint64_t size)
{
uint64_t val;
npu3_scom_sel(npu, reg, size);
xscom_read(npu->chip_id,
npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_DATA,
&val);
return val;
}
void npu3_write(struct npu3 *npu, uint64_t reg, uint64_t val)
{
void *mmio = (void *)npu->regs[0];
if (mmio)
out_be64(mmio + reg, val);
else
npu3_scom_write(npu, reg, NPU3_MISC_DA_LEN_8B, val);
/* CQ_SM writes should be mirrored in all four blocks */
if (NPU3_REG_BLOCK(reg) != NPU3_BLOCK_CQ_SM(0))
return;
for (uint32_t i = 1; i < 4; i++)
npu3_write(npu, NPU3_BLOCK_CQ_SM(i) + NPU3_REG_OFFSET(reg),
val);
}
uint64_t npu3_read(struct npu3 *npu, uint64_t reg)
{
void *mmio = (void *)npu->regs[0];
if (mmio)
return in_be64(mmio + reg);
return npu3_scom_read(npu, reg, NPU3_MISC_DA_LEN_8B);
}
void npu3_write_4b(struct npu3 *npu, uint64_t reg, uint32_t val)
{
void *mmio = (void *)npu->regs[0];
if (mmio)
out_be32(mmio + reg, val);
else
npu3_scom_write(npu, reg, NPU3_MISC_DA_LEN_4B,
(uint64_t)val << 32);
if (NPU3_REG_BLOCK(reg) != NPU3_BLOCK_CQ_SM(0))
return;
for (uint32_t i = 1; i < 4; i++)
npu3_write_4b(npu, NPU3_BLOCK_CQ_SM(i) + NPU3_REG_OFFSET(reg),
val);
}
uint32_t npu3_read_4b(struct npu3 *npu, uint64_t reg)
{
void *mmio = (void *)npu->regs[0];
if (mmio)
return in_be32(mmio + reg);
return npu3_scom_read(npu, reg, NPU3_MISC_DA_LEN_4B) >> 32;
}
static void npu3_misc_config(struct npu3 *npu)
{
struct npu3_dev *dev;
uint32_t typemap = 0;
uint64_t reg, val;
npu3_for_each_nvlink_dev(dev, npu)
typemap |= 0x10 >> dev->index;
reg = NPU3_MCP_MISC_CFG0;
val = npu3_read(npu, reg);
val |= NPU3_MCP_MISC_CFG0_ENABLE_PBUS;
val &= ~NPU3_MCP_MISC_CFG0_ENABLE_SNARF_CPM;
val = SETFIELD(NPU3_MCP_MISC_CFG0_NVLINK_MODE, val, typemap);
val = SETFIELD(NPU3_MCP_MISC_CFG0_OCAPI_MODE, val, ~typemap);
npu3_write(npu, reg, val);
reg = NPU3_SNP_MISC_CFG0;
val = npu3_read(npu, reg);
val |= NPU3_SNP_MISC_CFG0_ENABLE_PBUS;
val = SETFIELD(NPU3_SNP_MISC_CFG0_NVLINK_MODE, val, typemap);
val = SETFIELD(NPU3_SNP_MISC_CFG0_OCAPI_MODE, val, ~typemap);
npu3_write(npu, reg, val);
reg = NPU3_CTL_MISC_CFG2;
val = npu3_read(npu, reg);
val = SETFIELD(NPU3_CTL_MISC_CFG2_NVLINK_MODE, val, typemap);
val = SETFIELD(NPU3_CTL_MISC_CFG2_OCAPI_MODE, val, ~typemap);
npu3_write(npu, reg, val);
reg = NPU3_DAT_MISC_CFG1;
val = npu3_read(npu, reg);
val = SETFIELD(NPU3_DAT_MISC_CFG1_NVLINK_MODE, val, typemap);
val = SETFIELD(NPU3_DAT_MISC_CFG1_OCAPI_MODE, val, ~typemap);
npu3_write(npu, reg, val);
}
static void npu3_assign_bars(struct npu3 *npu)
{
struct npu3_dev *dev;
uint64_t addr, size, val;
/* Global MMIO bar (per npu) */
phys_map_get(npu->chip_id, NPU_REGS, npu->index, &addr, &size);
val = SETFIELD(NPU3_MMIO_BAR_ADDR, 0ull, addr >> 24);
val |= NPU3_MMIO_BAR_ENABLE;
npu3_write(npu, NPU3_MMIO_BAR, val);
NPU3INF(npu, "MMIO base: 0x%016llx (%lldMB)\n", addr, size >> 20);
npu->regs[0] = addr;
npu->regs[1] = size;
/* NTL bar (per device) */
npu3_for_each_dev(dev, npu) {
phys_map_get(npu->chip_id, NPU_NTL, npu3_chip_dev_index(dev),
&addr, &size);
val = SETFIELD(NPU3_NTL_BAR_ADDR, 0ull, addr >> 16);
val = SETFIELD(NPU3_NTL_BAR_SIZE, val, ilog2(size >> 16));
npu3_write(npu, NPU3_NTL_BAR(dev->index), val);
dev->ntl_bar.addr = addr;
dev->ntl_bar.size = size;
}
/* GENID bar (logically divided per device) */
phys_map_get(npu->chip_id, NPU_GENID, npu->index, &addr, NULL);
val = SETFIELD(NPU3_GENID_BAR_ADDR, 0ull, addr >> 19);
npu3_write(npu, NPU3_GENID_BAR, val);
npu3_for_each_dev(dev, npu) {
dev->genid_bar.addr = addr + (dev->index << 16);
dev->genid_bar.size = 64 << 10;
}
}
void npu3_dev_enable_bars(struct npu3_dev *dev, bool enable)
{
struct npu3 *npu = dev->npu;
uint64_t reg, val;
if (dev->ntl_bar.enable == enable) /* No state change */
return;
dev->ntl_bar.enable = enable;
dev->genid_bar.enable = enable;
reg = NPU3_NTL_BAR(dev->index);
val = npu3_read(npu, reg);
val = SETFIELD(NPU3_NTL_BAR_ENABLE, val, enable);
npu3_write(npu, reg, val);
/*
* Generation IDs are a single space in the hardware but we split them
* per device. Only disable in hardware if every device has disabled.
*/
if (!enable)
npu3_for_each_dev(dev, npu)
if (dev->genid_bar.enable)
return;
reg = NPU3_GENID_BAR;
val = npu3_read(npu, reg);
val = SETFIELD(NPU3_GENID_BAR_ENABLE, val, enable);
npu3_write(npu, reg, val);
}
static uint64_t npu3_ipi_attributes(struct irq_source *is, uint32_t isn)
{
struct npu3 *npu = is->data;
uint32_t level = isn - npu->irq_base;
/* TCE interrupt is used to detect a frozen PE */
if (level == 18)
return IRQ_ATTR_TARGET_OPAL |
IRQ_ATTR_TARGET_RARE |
IRQ_ATTR_TYPE_MSI;
return IRQ_ATTR_TARGET_LINUX;
}
static void npu3_ipi_interrupt(struct irq_source *is, uint32_t isn)
{
struct npu3 *npu = is->data;
uint32_t level = isn - npu->irq_base;
if (level != 18) {
NPU3ERR(npu, "Received unknown interrupt %d\n", level);
return;
}
opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, OPAL_EVENT_PCI_ERROR);
}
#define NPU3_IRQ_LEVELS 60
static char *npu3_ipi_name(struct irq_source *is, uint32_t isn)
{
struct npu3 *npu = is->data;
uint32_t level = isn - npu->irq_base;
static const char *names[NPU3_IRQ_LEVELS] = {
[0] = "NDL 0 Stall Event (brick 0)",
[1] = "NDL 0 No-Stall Event (brick 0)",
[2] = "NDL 1 Stall Event (brick 1)",
[3] = "NDL 1 No-Stall Event (brick 1)",
[4] = "NDL 2 Stall Event (brick 2)",
[5] = "NDL 2 No-Stall Event (brick 2)",
[6] = "NDL 3 Stall Event (brick 3)",
[7] = "NDL 3 No-Stall Event (brick 3)",
[8] = "NDL 4 Stall Event (brick 4)",
[9] = "NDL 4 No-Stall Event (brick 4)",
[10] = "NDL 5 Stall Event (brick 5)",
[11] = "NDL 5 No-Stall Event (brick 5)",
[12] = "NTL 0 Event",
[13] = "NTL 1 Event",
[14] = "NTL 2 Event",
[15] = "NTL 3 Event",
[16] = "NTL 4 Event",
[17] = "NTL 5 Event",
[18] = "TCE Event",
[19] = "ATS Event",
[20] = "CQ Event",
[21] = "MISC Event",
[41] = "Memory Controller Event",
[42] = "NDL 6 Stall Event (brick 6)",
[43] = "NDL 6 No-Stall Event (brick 6)",
[44] = "NDL 7 Stall Event (brick 7)",
[45] = "NDL 7 No-Stall Event (brick 7)",
[46] = "NDL 8 Stall Event (brick 8)",
[47] = "NDL 8 No-Stall Event (brick 8)",
[48] = "NDL 9 Stall Event (brick 9)",
[49] = "NDL 9 No-Stall Event (brick 9)",
[50] = "NDL 10 Stall Event (brick 10)",
[51] = "NDL 10 No-Stall Event (brick 10)",
[52] = "NDL 11 Stall Event (brick 11)",
[53] = "NDL 11 No-Stall Event (brick 11)",
[54] = "NTL 6 Event",
[55] = "NTL 7 Event",
[56] = "NTL 8 Event",
[57] = "NTL 9 Event",
[58] = "NTL 10 Event",
[59] = "NTL 11 Event",
};
if (level >= NPU3_IRQ_LEVELS || !names[level])
return strdup("Unknown");
return strdup(names[level]);
}
static const struct irq_source_ops npu3_ipi_ops = {
.attributes = npu3_ipi_attributes,
.interrupt = npu3_ipi_interrupt,
.name = npu3_ipi_name,
};
static void npu3_setup_irqs(struct npu3 *npu)
{
uint64_t reg, val;
uint32_t base;
base = xive_alloc_ipi_irqs(npu->chip_id, NPU3_IRQ_LEVELS, 64);
if (base == XIVE_IRQ_ERROR) {
NPU3ERR(npu, "Failed to allocate interrupt sources\n");
return;
}
xive_register_ipi_source(base, NPU3_IRQ_LEVELS, npu, &npu3_ipi_ops);
/* Set IPI configuration */
reg = NPU3_MISC_CFG;
val = npu3_read(npu, reg);
val = SETFIELD(NPU3_MISC_CFG_IPI_PS, val, NPU3_MISC_CFG_IPI_PS_64K);
val = SETFIELD(NPU3_MISC_CFG_IPI_OS, val, NPU3_MISC_CFG_IPI_OS_AIX);
npu3_write(npu, reg, val);
/* Set IRQ base */
reg = NPU3_MISC_INT_BAR;
val = SETFIELD(NPU3_MISC_INT_BAR_ADDR, 0ull,
(uint64_t)xive_get_trigger_port(base) >> 12);
npu3_write(npu, reg, val);
npu->irq_base = base;
}
static void npu3_init(struct npu3 *npu)
{
struct npu3_dev *dev;
platform.npu3_device_detect(npu);
npu3_for_each_dev(dev, npu)
npu3_device_detect_fixup(dev);
npu3_misc_config(npu);
npu3_assign_bars(npu);
npu3_setup_irqs(npu);
npu3_init_nvlink(npu);
}
void probe_npu3(void)
{
struct dt_node *dn;
struct npu3 *npu;
if (!npu3_dt_create())
return;
if (!platform.npu3_device_detect) {
prlog(PR_INFO, "NPU: Platform does not support NPU\n");
return;
}
dt_for_each_compatible(dt_root, dn, "ibm,power9-npu3") {
npu = npu3_create(dn);
npu3_init(npu);
}
}