blob: dba7ee50fde819cc2a2163bf23b1eb91017b8ae1 [file] [log] [blame]
// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
/*
* NVLink1, supported by the NPU (POWER8)
*
* Copyright 2013-2019 IBM Corp.
*/
#include <skiboot.h>
#include <io.h>
#include <timebase.h>
#include <pci.h>
#include <pci-cfg.h>
#include <pci-virt.h>
#include <pci-slot.h>
#include <interrupts.h>
#include <opal.h>
#include <opal-api.h>
#include <cpu.h>
#include <device.h>
#include <ccan/str/str.h>
#include <ccan/array_size/array_size.h>
#include <ccan/build_assert/build_assert.h>
#include <affinity.h>
#include <npu-regs.h>
#include <npu.h>
#include <xscom.h>
#include <string.h>
/*
* Terminology:
*
* Brick - A group of either 8 TX or 8 RX lanes
* Link - A group of 8 TX and 8 RX lanes
*
* Each link is represented in system software as an emulated PCI
* device. Garrison has two chips each with 4 links, therefore there
* are 8 emulated PCI devices in total.
*
* +----------------------------------------------------------------+
* | PBCQ3 (SCOM Base Address 0x2012c00) |
* | PHB3 (SCOM Base Address 0x9012c00) |
* +----------------------------------------------------------------+
* |||||||| ||||||||
* |||||||| ||||||||
* |||||||| ||||||||
* |||||||| ||||||||
* +----------------------------------------------------------------+
* | PCIe x8 |
* +----------------------------------------------------------------+
* | GPU0 |
* +--------------------------------+-------------------------------+
* | NV Link 1 | NV Link 0 |
* +---------------+----------------+---------------+---------------+
* | RX | TX | RX | TX |
* +---------------+----------------+---------------+---------------+
* |||||||| |||||||| |||||||| ||||||||
* |||||||| |||||||| |||||||| ||||||||
* |||||||| |||||||| |||||||| ||||||||
* |||||||| |||||||| |||||||| ||||||||
* +---------------+----------------+---------------+---------------+
* | TX | RX | TX | RX |
* +---------------+----------------+---------------+---------------+
* | Lanes [0:7] PHY 0 Lanes [8:15] |
* | SCOM Base Address 0x8000080008010c3f |
* +--------------------------------+-------------------------------+
* | Link 0 NDL/NTL | Link 1 NTL/NDL |
* | SCOM Base Address 0x8013c00 | SCOM Base Address 0x8013c40 |
* +--------------------------------+-------------------------------+
* | |
* | Address Translation/AT (shared for all links) |
* | SCOM Base Address 0x8013d80 |
* | |
* +--------------------------------+-------------------------------+
* | Link 3 NDL/NTL | Link 4 NTL/NDL |
* | SCOM Base Address 0x8013d00 | SCOM Base Address 0x8013d40 |
* +--------------------------------+-------------------------------+
* | Lanes [8:15] PHY 1 Lanes [0:7] |
* | SCOM Base Address 0x8000080008010c7f |
* +---------------+----------------+---------------+---------------+
* | TX | RX | TX | RX |
* +---------------+----------------+---------------+---------------+
* |||||||| |||||||| |||||||| ||||||||
* |||||||| |||||||| |||||||| ||||||||
* |||||||| |||||||| |||||||| ||||||||
* |||||||| |||||||| |||||||| ||||||||
* +---------------+----------------+---------------+---------------+
* | RX | TX | RX | TX |
* +---------------+----------------+---------------+---------------+
* | NV Link 2 | NV Link 3 |
* +--------------------------------+-------------------------------+
* | GPU1 |
* +----------------------------------------------------------------+
* | PCIe x8 |
* +----------------------------------------------------------------+
* |||||||| ||||||||
* |||||||| ||||||||
* |||||||| ||||||||
* |||||||| ||||||||
* +----------------------------------------------------------------+
* | PHB2 (SCOM Base Address 0x9012800) |
* | PBCQ2 (SCOM Base Address 0x2012800) |
* +----------------------------------------------------------------+
*
*/
static struct npu_dev_cap *npu_dev_find_capability(struct npu_dev *dev,
uint16_t id);
#define OPAL_NPU_VERSION 0x02
#define PCIE_CAP_START 0x40
#define PCIE_CAP_END 0x80
#define VENDOR_CAP_START 0x80
#define VENDOR_CAP_END 0x90
#define VENDOR_CAP_PCI_DEV_OFFSET 0x0d
/* Returns the scom base for the given link index */
/* Returns the scom base for the given link index */
static uint64_t npu_link_scom_base(struct dt_node *dn, uint32_t scom_base,
				   int index)
{
	char name[32];
	struct dt_node *node;
	uint32_t idx;

	/* Locate the link@<index> child and use its assigned
	 * ibm,npu-link-index to offset into the SCOM space. */
	snprintf(name, sizeof(name), "link@%x", index);
	node = dt_find_by_name(dn, name);
	assert(node);

	idx = dt_prop_get_u32(node, "ibm,npu-link-index");
	return scom_base + idx * NPU_LINK_SIZE;
}
/*
 * Decode the size of a link BAR from its SCOM register value.
 *
 * The hardware encodes the size as a power-of-two multiple of 64KB.
 * Use an unsigned-long constant for the shift: `1 << x` is an int
 * shift, so an encoded field of 31 or more would be undefined
 * behaviour / truncate before the widening multiply.
 */
static uint64_t get_bar_size(uint64_t bar)
{
	return (1ul << GETFIELD(NX_MMIO_BAR_SIZE, bar)) * 0x10000;
}
/* Update the changes of the device BAR to link BARs */
/* Update the changes of the device BAR to link BARs */
static void npu_dev_bar_update(uint32_t gcid, struct npu_dev_bar *bar,
			       bool enable)
{
	uint64_t val;

	/* Nothing to sync if this BAR has no backing SCOM register */
	if (!bar->xscom)
		return;

	val = bar->base;
	/* The hardware encodes the size as log2 of the number of 64KB units */
	val = SETFIELD(NX_MMIO_BAR_SIZE, val, ilog2(bar->size / 0x10000));
	if (enable)
		val |= NX_MMIO_BAR_ENABLE;
	xscom_write(gcid, bar->xscom, val);
}
/* Trap for PCI command (0x4) to enable or disable device's BARs */
/* Trap for PCI command (0x4) to enable or disable device's BARs */
static int64_t npu_dev_cfg_write_cmd(void *dev,
				     struct pci_cfg_reg_filter *pcrf __unused,
				     uint32_t offset, uint32_t size,
				     uint32_t *data, bool write)
{
	struct pci_virt_device *pvd = dev;
	struct npu_dev *ndev = pvd->data;
	bool enable;

	/* Reads fall straight through to the cached config space */
	if (!write)
		return OPAL_PARTIAL;

	if (offset != PCI_CFG_CMD)
		return OPAL_PARAMETER;
	if (size != 1 && size != 2 && size != 4)
		return OPAL_PARAMETER;

	/* Update device BARs and link BARs will be synchronized
	 * with hardware automatically.
	 */
	enable = !!(*data & PCI_CFG_CMD_MEM_EN);
	npu_dev_bar_update(ndev->npu->chip_id, &ndev->bar, enable);

	/* Normal path to update PCI config buffer */
	return OPAL_PARTIAL;
}
/*
* Trap for memory BARs: 0xFF's should be written to BAR register
* prior to getting its size.
*/
/*
 * Trap for memory BARs: 0xFF's should be written to BAR register
 * prior to getting its size.
 */
static int64_t npu_dev_cfg_bar_read(struct npu_dev *dev __unused,
				    struct pci_cfg_reg_filter *pcrf,
				    uint32_t offset, uint32_t size,
				    uint32_t *data)
{
	struct npu_dev_bar *bar = (struct npu_dev_bar *)(pcrf->data);

	/* Revert to normal path if we weren't trapped for BAR size */
	if (!bar->trapped)
		return OPAL_PARTIAL;

	/* Only aligned 4-byte reads of either word of the 64-bit BAR */
	if (offset != pcrf->start &&
	    offset != pcrf->start + 4)
		return OPAL_PARAMETER;
	if (size != 4)
		return OPAL_PARAMETER;

	/* One-shot: return the size value latched by the preceding
	 * all-ones write, then resume normal reads. */
	bar->trapped = false;
	*data = bar->bar_sz;
	return OPAL_SUCCESS;
}
/* Trap for writes to the 64-bit memory BAR pair.  An all-ones write
 * latches the BAR size for the following read (standard PCI size
 * probing); any other value updates the cached BAR base and, once the
 * high word has been written, pushes the new base to the hardware. */
static int64_t npu_dev_cfg_bar_write(struct npu_dev *dev,
				     struct pci_cfg_reg_filter *pcrf,
				     uint32_t offset, uint32_t size,
				     uint32_t data)
{
	struct pci_virt_device *pvd = dev->pvd;
	struct npu_dev_bar *bar = (struct npu_dev_bar *)(pcrf->data);
	uint32_t pci_cmd;

	if (offset != pcrf->start &&
	    offset != pcrf->start + 4)
		return OPAL_PARAMETER;
	if (size != 4)
		return OPAL_PARAMETER;

	/* Return BAR size on next read */
	if (data == 0xffffffff) {
		bar->trapped = true;
		if (offset == pcrf->start)
			bar->bar_sz = (bar->size & 0xffffffff);
		else
			bar->bar_sz = (bar->size >> 32);

		return OPAL_SUCCESS;
	}

	/* Update BAR base address */
	if (offset == pcrf->start) {
		/* Low word: keep the 16-byte alignment bits clear */
		bar->base &= 0xffffffff00000000UL;
		bar->base |= (data & 0xfffffff0);
	} else {
		bar->base &= 0x00000000ffffffffUL;
		bar->base |= ((uint64_t)data << 32);

		/* High word written last: sync the link BAR using the
		 * current memory-enable bit from the PCI command reg */
		PCI_VIRT_CFG_NORMAL_RD(pvd, PCI_CFG_CMD, 4, &pci_cmd);
		npu_dev_bar_update(dev->npu->chip_id, bar,
				   !!(pci_cmd & PCI_CFG_CMD_MEM_EN));
	}

	/* We still depend on the normal path to update the
	 * cached config buffer.
	 * NOTE(review): this returns OPAL_PARAMETER rather than
	 * OPAL_PARTIAL like the other filters — confirm the pci-virt
	 * layer treats both as "continue with the normal path".
	 */
	return OPAL_PARAMETER;
}
/* Config-space filter entry point for the memory BAR pair: dispatch
 * to the dedicated read or write handler. */
static int64_t npu_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf,
			       uint32_t offset, uint32_t len, uint32_t *data,
			       bool write)
{
	struct pci_virt_device *pvd = dev;
	struct npu_dev *ndev = pvd->data;

	if (!write)
		return npu_dev_cfg_bar_read(ndev, pcrf, offset, len, data);

	return npu_dev_cfg_bar_write(ndev, pcrf, offset, len, *data);
}
/* Trap for writes to the PCIe Device Control register: catch function
 * level reset requests and run the NPU reset procedure. */
static int64_t npu_dev_cfg_exp_devcap(void *dev,
				      struct pci_cfg_reg_filter *pcrf __unused,
				      uint32_t offset, uint32_t size,
				      uint32_t *data, bool write)
{
	struct pci_virt_device *pvd = dev;
	struct npu_dev *ndev = pvd->data;

	/* The filter is registered write-only, so reads never land here */
	assert(write);

	if ((size != 2) || (offset & 1)) {
		/* Short config writes are not supported */
		prlog(PR_ERR, "NPU%d: Unsupported write to pcie control register\n",
		      ndev->phb->opal_id);
		return OPAL_PARAMETER;
	}

	if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
		npu_dev_procedure_reset(ndev);

	/* Let the normal path update the cached config space */
	return OPAL_PARTIAL;
}
/* Look up the emulated NPU device for a bdfn.  All emulated devices
 * hang off the root bus, so only bus 0 bdfns are valid.  Returns NULL
 * when the bdfn is out of range or unpopulated. */
static struct npu_dev *bdfn_to_npu_dev(struct npu *p, uint32_t bdfn)
{
	struct pci_virt_device *pvd;

	/* Sanity check: bus number must be zero */
	if (bdfn & ~0xff)
		return NULL;

	pvd = pci_virt_find_device(&p->phb, bdfn);
	return pvd ? pvd->data : NULL;
}
/*
 * Generate the PHB config-space accessors for each access width.
 * They all funnel through the generic pci-virt helpers, which operate
 * on the emulated config-space buffer (and run any registered filters).
 */
#define NPU_CFG_READ(size, type)					\
static int64_t npu_cfg_read##size(struct phb *phb, uint32_t bdfn,	\
				  uint32_t offset, type *data)		\
{									\
	uint32_t val;							\
	int64_t ret;							\
									\
	ret = pci_virt_cfg_read(phb, bdfn, offset, sizeof(*data), &val); \
	*data = (type)val;						\
	return ret;							\
}
#define NPU_CFG_WRITE(size, type)					\
static int64_t npu_cfg_write##size(struct phb *phb, uint32_t bdfn,	\
				   uint32_t offset, type data)		\
{									\
	uint32_t val = data;						\
									\
	return pci_virt_cfg_write(phb, bdfn, offset, sizeof(data), val); \
}

NPU_CFG_READ(8, u8);
NPU_CFG_READ(16, u16);
NPU_CFG_READ(32, u32);
NPU_CFG_WRITE(8, u8);
NPU_CFG_WRITE(16, u16);
NPU_CFG_WRITE(32, u32);
/* pci_walk_dev() callback: returns 1 when @pd is the NVIDIA GPU whose
 * slot label matches the NPU device passed in @data, 0 otherwise. */
static int __npu_dev_bind_pci_dev(struct phb *phb __unused,
				  struct pci_device *pd,
				  void *data)
{
	struct npu_dev *dev = data;
	struct dt_node *pci_dt_node;
	char *pcislot;

	/* Ignore non-nvidia PCI devices */
	if ((pd->vdid & 0xffff) != 0x10de)
		return 0;

	/* Find the PCI device's slot location by walking up the
	 * device-tree until a node carries ibm,slot-label */
	for (pci_dt_node = pd->dn;
	     pci_dt_node && !dt_find_property(pci_dt_node, "ibm,slot-label");
	     pci_dt_node = pci_dt_node->parent);

	if (!pci_dt_node)
		return 0;

	pcislot = (char *)dt_prop_get(pci_dt_node, "ibm,slot-label");

	prlog(PR_DEBUG, "NPU: comparing GPU %s and NPU %s\n",
	      pcislot, dev->slot_label);

	if (streq(pcislot, dev->slot_label))
		return 1;

	return 0;
}
/* Scan every other PHB for the physical GPU matching this NPU device's
 * slot label; on success, record the device/PHB and flag the binding
 * in the emulated vendor capability's config space. */
static void npu_dev_bind_pci_dev(struct npu_dev *dev)
{
	struct phb *phb;
	uint32_t i;

	/* Already bound */
	if (dev->pd)
		return;

	for (i = 0; i < 64; i++) {
		/* Skip our own (emulated) PHB */
		if (dev->npu->phb.opal_id == i)
			continue;

		phb = pci_get_phb(i);
		if (!phb)
			continue;

		dev->pd = pci_walk_dev(phb, NULL, __npu_dev_bind_pci_dev, dev);
		if (dev->pd) {
			dev->phb = phb;

			/* Found the device, set the bit in config space */
			PCI_VIRT_CFG_INIT_RO(dev->pvd, VENDOR_CAP_START +
					     VENDOR_CAP_PCI_DEV_OFFSET, 1, 0x01);
			return;
		}
	}

	prlog(PR_INFO, "%s: No PCI device for NPU device %04x:%02x:%02x.%x to bind to. If you expect a GPU to be there, this is a problem.\n",
	      __func__, dev->npu->phb.opal_id,
	      dev->pvd->bdfn >> 8 & 0xff,
	      dev->pvd->bdfn >> 3 & 0x1f,
	      dev->pvd->bdfn & 0x7);
}
static struct lock pci_npu_phandle_lock = LOCK_UNLOCKED;
/* Appends an NPU phandle to the given PCI device node ibm,npu
* property */
/* Appends an NPU phandle to the given PCI device node ibm,npu
 * property */
static void npu_append_pci_phandle(struct dt_node *dn, u32 phandle)
{
	uint32_t *npu_phandles;
	struct dt_property *pci_npu_phandle_prop;
	size_t prop_len;

	/* Use a lock to make sure no one else has a reference to an
	 * ibm,npu property (this assumes this is the only function
	 * that holds a reference to it). */
	lock(&pci_npu_phandle_lock);

	/* This function shouldn't be called unless ibm,npu exists */
	pci_npu_phandle_prop = (struct dt_property *)
		dt_require_property(dn, "ibm,npu", -1);

	/* Need to append to the properties */
	prop_len = pci_npu_phandle_prop->len;
	prop_len += sizeof(*npu_phandles);
	/* May reallocate: pci_npu_phandle_prop is updated in place */
	dt_resize_property(&pci_npu_phandle_prop, prop_len);

	/* Store the new phandle into the freshly grown tail slot */
	npu_phandles = (uint32_t *) pci_npu_phandle_prop->prop;
	npu_phandles[prop_len/sizeof(*npu_phandles) - 1] = phandle;
	unlock(&pci_npu_phandle_lock);
}
/* pci_walk_dev() callback: for each emulated NPU device, record its
 * slot label, bind it to the physical GPU and wire the ibm,npu /
 * ibm,gpu cross-references into the device-tree. */
static int npu_dn_fixup(struct phb *phb,
			struct pci_device *pd,
			void *data __unused)
{
	struct npu *p = phb_to_npu(phb);
	struct npu_dev *dev;

	dev = bdfn_to_npu_dev(p, pd->bdfn);
	assert(dev);

	/* Already bound on a previous pass */
	if (dev->phb || dev->pd)
		return 0;

	/* NPU devices require a slot location to associate with GPUs */
	dev->slot_label = dt_prop_get(pd->dn, "ibm,slot-label");

	/* Bind the emulated PCI device with the real one, which can't
	 * be done until the PCI devices are populated. Once the real
	 * PCI device is identified, we also need fix the device-tree
	 * for it
	 */
	npu_dev_bind_pci_dev(dev);
	if (dev->phb && dev->pd && dev->pd->dn) {
		/* A GPU with multiple links already has an ibm,npu
		 * list to append to; otherwise create it */
		if (dt_find_property(dev->pd->dn, "ibm,npu"))
			npu_append_pci_phandle(dev->pd->dn, pd->dn->phandle);
		else
			dt_add_property_cells(dev->pd->dn, "ibm,npu", pd->dn->phandle);

		dt_add_property_cells(pd->dn, "ibm,gpu", dev->pd->dn->phandle);
	}

	return 0;
}
/* PHB hook run after PCI enumeration: bind each emulated NPU device
 * to its physical GPU and cross-link the device-tree nodes. */
static void npu_phb_final_fixup(struct phb *phb)
{
	pci_walk_dev(phb, NULL, npu_dn_fixup, NULL);
}
/* Initialise the cached IODA tables to safe power-on defaults;
 * npu_ioda_reset() pushes these caches out to the hardware. */
static void npu_ioda_init(struct npu *p)
{
	uint64_t *data64;
	uint32_t i;

	/* LXIVT - Disable all LSIs (priority 0xff, server 0) */
	for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) {
		data64 = &p->lxive_cache[i];
		*data64 = SETFIELD(NPU_IODA_LXIVT_PRIORITY, 0ul, 0xff);
		*data64 = SETFIELD(NPU_IODA_LXIVT_SERVER, *data64, 0);
	}

	/* PCT - Reset to reserved PE# */
	for (i = 0; i < ARRAY_SIZE(p->pce_cache); i++) {
		data64 = &p->pce_cache[i];
		*data64 = SETFIELD(NPU_IODA_PCT_PE, 0ul, 0ul);
		*data64 |= NPU_IODA_PCT_LINK_ENABLED;
	}

	/* Clear TVT */
	memset(p->tve_cache, 0, sizeof(p->tve_cache));
}
/* Write the cached IODA state back into the AT hardware; with @purge,
 * first reset the caches to their defaults via npu_ioda_init(). */
static int64_t npu_ioda_reset(struct phb *phb, bool purge)
{
	struct npu *p = phb_to_npu(phb);
	uint32_t i;

	if (purge) {
		NPUDBG(p, "Purging all IODA tables...\n");
		npu_ioda_init(p);
	}

	/* LIST: each table select auto-increments on DATA0 writes */
	npu_ioda_sel(p, NPU_IODA_TBL_LIST, 0, true);
	for (i = 0; i < 8; i++)
		out_be64(p->at_regs + NPU_IODA_DATA0, 0x1);

	/* LIXVT */
	npu_ioda_sel(p, NPU_IODA_TBL_LXIVT, 0, true);
	for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++)
		out_be64(p->at_regs + NPU_IODA_DATA0, p->lxive_cache[i]);

	/* PCT */
	npu_ioda_sel(p, NPU_IODA_TBL_PCT, 0, true);
	for (i = 0; i < ARRAY_SIZE(p->pce_cache); i++)
		out_be64(p->at_regs + NPU_IODA_DATA0, p->pce_cache[i]);

	/* TVT */
	npu_ioda_sel(p, NPU_IODA_TBL_TVT, 0, true);
	for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++)
		out_be64(p->at_regs + NPU_IODA_DATA0, p->tve_cache[i]);

	return OPAL_SUCCESS;
}
/* Validate that @isn belongs to this NPU: right chip, NPU index 0
 * (only one NPU per chip is handled here), and an IRQ number inside
 * the LSI range.  Returns true/false. */
static int npu_isn_valid(struct npu *p, uint32_t isn)
{
	if (p->chip_id != p8_irq_to_chip(isn) || p->index != 0 ||
	    NPU_IRQ_NUM(isn) < NPU_LSI_IRQ_MIN ||
	    NPU_IRQ_NUM(isn) > NPU_LSI_IRQ_MAX) {
		/**
		 * @fwts-label NPUisnInvalid
		 * @fwts-advice NVLink not functional
		 */
		prlog(PR_ERR, "NPU%d: isn 0x%x not valid for this NPU\n",
		      p->phb.opal_id, isn);
		return false;
	}

	return true;
}
/* irq_source op: report server/priority for an LSI from the LXIVT cache */
static int64_t npu_lsi_get_xive(struct irq_source *is, uint32_t isn,
				uint16_t *server, uint8_t *prio)
{
	struct npu *p = is->data;
	uint32_t irq = NPU_IRQ_NUM(isn);
	uint64_t lxive;

	if (!npu_isn_valid(p, isn))
		return OPAL_PARAMETER;

	/* The content is fetched from the cache, which requires
	 * that the initial cache should be initialized with the
	 * default values
	 */
	irq -= NPU_LSI_IRQ_MIN;
	lxive = p->lxive_cache[irq];
	*server = GETFIELD(NPU_IODA_LXIVT_SERVER, lxive);
	*prio = GETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive);

	return OPAL_SUCCESS;
}
/* irq_source op: set server/priority for an LSI, updating both the
 * software cache and the hardware LXIVT entry. */
static int64_t npu_lsi_set_xive(struct irq_source *is, uint32_t isn,
				uint16_t server, uint8_t prio)
{
	struct npu *p = is->data;
	uint32_t irq = NPU_IRQ_NUM(isn);
	uint64_t lxive;

	if (!npu_isn_valid(p, isn))
		return OPAL_PARAMETER;

	/* Figure out LXIVT entry */
	lxive = SETFIELD(NPU_IODA_LXIVT_SERVER, 0ul, server);
	lxive = SETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive, prio);

	/* Cache LXIVT entry */
	irq -= NPU_LSI_IRQ_MIN;
	p->lxive_cache[irq] = lxive;

	/* Update to LXIVT entry: read-modify-write so the other
	 * fields of the hardware entry are preserved */
	npu_ioda_sel(p, NPU_IODA_TBL_LXIVT, irq, false);
	lxive = in_be64(p->at_regs + NPU_IODA_DATA0);
	lxive = SETFIELD(NPU_IODA_LXIVT_SERVER, lxive, server);
	lxive = SETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive, prio);
	out_be64(p->at_regs + NPU_IODA_DATA0, lxive);

	return OPAL_SUCCESS;
}
/* Handler for the skiboot-owned NPU error LSIs */
static void npu_err_interrupt(struct irq_source *is, uint32_t isn)
{
	struct npu *p = is->data;
	uint32_t irq = NPU_IRQ_NUM(isn);

	if (!npu_isn_valid(p, isn))
		return;

	/* There're 4 LSIs used for error reporting: 4/5 for data
	 * link error reporting while 6/7 for frozen PE detection
	 */
	irq -= NPU_LSI_IRQ_MIN;
	switch (irq) {
	case 4 ... 5:
		prerror("Invalid NPU error interrupt received\n");
		break;
	case 6 ... 7:
		/* Signal the OS to poll the EEH state for a frozen PE */
		opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
					OPAL_EVENT_PCI_ERROR);
	}
}
/* irq_source op: the first four LSIs go to Linux; the remainder are
 * the error interrupts owned by skiboot (see npu_err_interrupt). */
static uint64_t npu_lsi_attributes(struct irq_source *is, uint32_t isn)
{
	struct npu *p = is->data;

	if ((isn - p->base_lsi) < 4)
		return IRQ_ATTR_TARGET_LINUX;

	return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_LSI;
}
/* Error LSIs (skiboot owned): the .interrupt handler fires for the
 * skiboot-targeted sources flagged by npu_lsi_attributes() */
static const struct irq_source_ops npu_lsi_irq_ops = {
	.get_xive	= npu_lsi_get_xive,
	.set_xive	= npu_lsi_set_xive,
	.attributes	= npu_lsi_attributes,
	.interrupt	= npu_err_interrupt,
};
/* Register all 8 NPU LSIs with the core interrupt code */
static void npu_register_irq(struct npu *p)
{
	register_irq_source(&npu_lsi_irq_ops, p, p->base_lsi, 8);
}
/* Minimal AT MMIO initialisation: program the LSI source base,
 * disable the interrupt reporting timer, and push the (default) IODA
 * caches to hardware. */
static void npu_hw_init(struct npu *p)
{
	/* 3 MMIO setup for AT */
	out_be64(p->at_regs + NPU_LSI_SOURCE_ID,
		 SETFIELD(NPU_LSI_SRC_ID_BASE, 0ul, NPU_LSI_IRQ_MIN >> 4));
	/* The source-ID field can only encode IRQ bases of this shape */
	BUILD_ASSERT((NPU_LSI_IRQ_MIN & 0x07F0) == NPU_LSI_IRQ_MIN);
	out_be64(p->at_regs + NPU_INTREP_TIMER, 0x0ul);
	npu_ioda_reset(&p->phb, false);
}
/* PHB op: set up (or disable, when @pci_mem_size is 0) a bypass DMA
 * window for @pe_number by programming its TVE directly with a start
 * and end address instead of a TCE table. */
static int64_t npu_map_pe_dma_window_real(struct phb *phb,
					  uint64_t pe_number,
					  uint16_t window_id,
					  uint64_t pci_start_addr,
					  uint64_t pci_mem_size)
{
	struct npu *p = phb_to_npu(phb);
	uint64_t end;
	uint64_t tve;

	/* Sanity check. Each PE has one corresponding TVE */
	if (pe_number >= NPU_NUM_OF_PES ||
	    window_id != pe_number)
		return OPAL_PARAMETER;

	if (pci_mem_size) {
		/* Enable */
		end = pci_start_addr + pci_mem_size;

		/* We have to be 16M aligned */
		if ((pci_start_addr & 0x00ffffff) ||
		    (pci_mem_size & 0x00ffffff))
			return OPAL_PARAMETER;

		/*
		 * It *looks* like this is the max we can support (we need
		 * to verify this. Also we are not checking for rollover,
		 * but then we aren't trying too hard to protect ourselves
		 * against a completely broken OS.
		 */
		if (end > 0x0003ffffffffffffull)
			return OPAL_PARAMETER;

		/*
		 * Put start address bits 49:24 into TVE[52:53]||[0:23]
		 * and end address bits 49:24 into TVE[54:55]||[24:47]
		 * and set TVE[51]
		 */
		tve  = (pci_start_addr << 16) & (0xffffffull << 48);
		tve |= (pci_start_addr >> 38) & (3ull << 10);
		tve |= (end >>  8) & (0xfffffful << 16);
		tve |= (end >> 40) & (3ull << 8);
		tve |= PPC_BIT(51);
	} else {
		/* Disable */
		tve = 0;
	}

	npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false);
	out_be64(p->at_regs + NPU_IODA_DATA0, tve);
	p->tve_cache[window_id] = tve;

	return OPAL_SUCCESS;
}
/* PHB op: program a TCE-translated DMA window for @pe_number by
 * filling in its TVE (table address, size, page size, levels).  A
 * zero @tce_table_size disables the window. */
static int64_t npu_map_pe_dma_window(struct phb *phb,
				     uint64_t pe_number,
				     uint16_t window_id,
				     uint16_t tce_levels,
				     uint64_t tce_table_addr,
				     uint64_t tce_table_size,
				     uint64_t tce_page_size)
{
	struct npu *p = phb_to_npu(phb);
	uint64_t tts_encoded;
	uint64_t data64 = 0;

	/* Sanity check. Each PE has one corresponding TVE */
	if (pe_number >= NPU_NUM_OF_PES ||
	    window_id != pe_number)
		return OPAL_PARAMETER;

	/* Special condition, zero TCE table size used to disable
	 * the TVE.
	 */
	if (!tce_table_size) {
		npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false);
		out_be64(p->at_regs + NPU_IODA_DATA0, 0ul);
		p->tve_cache[window_id] = 0ul;
		return OPAL_SUCCESS;
	}

	/* Additional arguments validation */
	if (tce_levels < 1 ||
	    tce_levels > 4 ||
	    !is_pow2(tce_table_size) ||
	    tce_table_size < 0x1000)
		return OPAL_PARAMETER;

	/* TCE table size: encoded as log2(size) - 11, capped at 39 */
	data64 = SETFIELD(NPU_IODA_TVT_TTA, 0ul, tce_table_addr >> 12);
	tts_encoded = ilog2(tce_table_size) - 11;
	if (tts_encoded > 39)
		return OPAL_PARAMETER;
	data64 = SETFIELD(NPU_IODA_TVT_SIZE, data64, tts_encoded);

	/* TCE page size: unknown sizes fall back to 4K */
	switch (tce_page_size) {
	case 0x10000:		/* 64K */
		data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 5);
		break;
	case 0x1000000:		/* 16M */
		data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 13);
		break;
	case 0x10000000:	/* 256M */
		data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 17);
		break;
	case 0x1000:		/* 4K */
	default:
		data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 1);
	}

	/* Number of levels */
	data64 = SETFIELD(NPU_IODA_TVT_LEVELS, data64, tce_levels - 1);

	/* Update to hardware */
	npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false);
	out_be64(p->at_regs + NPU_IODA_DATA0, data64);
	p->tve_cache[window_id] = data64;

	return OPAL_SUCCESS;
}
/* PHB op: map (or unmap) the link behind @bdfn to @pe_number by
 * updating the PCT entry for that link. */
static int64_t npu_set_pe(struct phb *phb,
			  uint64_t pe_number,
			  uint64_t bdfn,
			  uint8_t bcompare,
			  uint8_t dcompare,
			  uint8_t fcompare,
			  uint8_t action)
{
	struct npu *p = phb_to_npu(phb);
	struct npu_dev *dev;
	uint32_t link_idx;
	uint64_t *data64;

	/* Sanity check */
	if (action != OPAL_MAP_PE &&
	    action != OPAL_UNMAP_PE)
		return OPAL_PARAMETER;
	if (pe_number >= NPU_NUM_OF_PES)
		return OPAL_PARAMETER;

	/* All emulated PCI devices hooked to root bus, whose
	 * bus number is zero.
	 */
	dev = bdfn_to_npu_dev(p, bdfn);
	if (PCI_BUS_NUM(bdfn) || !dev)
		return OPAL_PARAMETER;

	link_idx = dev->index;
	dev->pe_number = pe_number;

	/* Separate links will be mapped to different PEs */
	if (bcompare != OpalPciBusAll ||
	    dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER ||
	    fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER)
		return OPAL_UNSUPPORTED;

	/* Map the link to the corresponding PE; unmapping points the
	 * link at the reserved PE# instead */
	data64 = &p->pce_cache[link_idx];
	if (action == OPAL_MAP_PE)
		*data64 = SETFIELD(NPU_IODA_PCT_PE, *data64,
				   pe_number);
	else
		*data64 = SETFIELD(NPU_IODA_PCT_PE, *data64,
				   NPU_NUM_OF_PES);

	*data64 |= NPU_IODA_PCT_LINK_ENABLED;

	npu_ioda_sel(p, NPU_IODA_TBL_PCT, link_idx, false);
	out_be64(p->at_regs + NPU_IODA_DATA0, *data64);

	return OPAL_SUCCESS;
}
/* Slot op: report a fixed x1 link-up state.  The link is emulated,
 * so the reported bandwidth is not meaningful. */
static int64_t npu_get_link_state(struct pci_slot *slot __unused, uint8_t *val)
{
	*val = OPAL_SHPC_LINK_UP_x1;
	return OPAL_SUCCESS;
}
/* Slot op: emulated slots are always powered on */
static int64_t npu_get_power_state(struct pci_slot *slot __unused, uint8_t *val)
{
	*val = PCI_SLOT_POWER_ON;
	return OPAL_SUCCESS;
}
/* Slot hot reset: a no-op here — presumably the OS drives the actual
 * link reset procedure itself (via the vendor-capability procedure
 * registers); confirm against the driver. */
static int64_t npu_hreset(struct pci_slot *slot __unused)
{
	prlog(PR_DEBUG, "NPU: driver should call reset procedure here\n");
	return OPAL_SUCCESS;
}
/* Slot fundamental reset: currently a successful no-op (see FIXME) */
static int64_t npu_freset(struct pci_slot *slot __unused)
{
	/* FIXME: PHB fundamental reset, which need to be
	 * figured out later. It's used by EEH recovery
	 * upon fenced AT.
	 */
	return OPAL_SUCCESS;
}
/* Allocate and populate the PCI slot for the emulated NPU PHB.
 * Returns NULL on allocation failure. */
static struct pci_slot *npu_slot_create(struct phb *phb)
{
	struct pci_slot *slot = pci_slot_alloc(phb, NULL);

	if (!slot)
		return NULL;

	/* Only the state queries and the reset hooks are meaningful
	 * for an emulated slot; everything else stays unimplemented. */
	slot->ops.get_link_state      = npu_get_link_state;
	slot->ops.get_power_state     = npu_get_power_state;
	slot->ops.hreset              = npu_hreset;
	slot->ops.freset              = npu_freset;

	slot->ops.get_presence_state  = NULL;
	slot->ops.get_attention_state = NULL;
	slot->ops.get_latch_state     = NULL;
	slot->ops.set_power_state     = NULL;
	slot->ops.set_attention_state = NULL;
	slot->ops.prepare_link_change = NULL;
	slot->ops.poll_link           = NULL;
	slot->ops.creset              = NULL;

	return slot;
}
/* EEH op: report the freeze state for a PE.  Only the global fenced
 * state is currently tracked, so all PEs report the same answer. */
static int64_t npu_freeze_status(struct phb *phb,
				 uint64_t pe_number __unused,
				 uint8_t *freeze_state,
				 uint16_t *pci_error_type __unused,
				 uint16_t *severity __unused)
{
	/* FIXME: When it's called by skiboot PCI config accessor,
	 * the PE number is fixed to 0, which is incorrect. We need
	 * introduce another PHB callback to translate it. For now,
	 * it keeps the skiboot PCI enumeration going.
	 */
	struct npu *p = phb_to_npu(phb);

	if (p->fenced)
		*freeze_state = OPAL_EEH_STOPPED_MMIO_DMA_FREEZE;
	else
		*freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
	return OPAL_SUCCESS;
}
/* EEH op: report a PHB-fenced condition first; otherwise scan the
 * PESTB table for the first frozen PE. */
static int64_t npu_eeh_next_error(struct phb *phb,
				  uint64_t *first_frozen_pe,
				  uint16_t *pci_error_type,
				  uint16_t *severity)
{
	struct npu *p = phb_to_npu(phb);
	int i;
	uint64_t result = 0;

	/* Defaults: no error */
	*first_frozen_pe = -1;
	*pci_error_type = OPAL_EEH_NO_ERROR;
	*severity = OPAL_EEH_SEV_NO_ERROR;

	if (p->fenced) {
		*pci_error_type = OPAL_EEH_PHB_ERROR;
		*severity = OPAL_EEH_SEV_PHB_FENCED;
		return OPAL_SUCCESS;
	}

	/* Walk PESTB; a non-zero entry marks a frozen PE */
	npu_ioda_sel(p, NPU_IODA_TBL_PESTB, 0, true);
	for (i = 0; i < NPU_NUM_OF_PES; i++) {
		result = in_be64(p->at_regs + NPU_IODA_DATA0);
		if (result > 0) {
			*first_frozen_pe = i;
			*pci_error_type = OPAL_EEH_PE_ERROR;
			*severity = OPAL_EEH_SEV_PE_ER;
			break;
		}
	}

	return OPAL_SUCCESS;
}
/* For use in error injection and handling. */
void npu_set_fence_state(struct npu *p, bool fence) {
p->fenced = fence;
if (fence)
prlog(PR_ERR, "NPU: Chip %x is fenced, reboot required.\n",
p->chip_id);
else
prlog(PR_WARNING, "NPU: un-fencing is dangerous and should \
only be used for development purposes.");
}
/* Sets the NPU to trigger an error when a DMA occurs */
/* Sets the NPU to trigger an error when a DMA occurs.
 * type 1 emulates a fence; type 0 forces a freeze by performing an
 * invalid MMIO read against the device BAR. */
static int64_t npu_err_inject(struct phb *phb, uint64_t pe_number,
			      uint32_t type, uint32_t func __unused,
			      uint64_t addr __unused, uint64_t mask __unused)
{
	struct npu *p = phb_to_npu(phb);
	struct npu_dev *dev = NULL;
	int i;

	if (pe_number >= NPU_NUM_OF_PES) {
		prlog(PR_ERR, "NPU: error injection failed, bad PE given\n");
		return OPAL_PARAMETER;
	}

	/* Find the device currently mapped to this PE */
	for (i = 0; i < p->total_devices; i++) {
		if (p->devices[i].pe_number == pe_number) {
			dev = &p->devices[i];
			break;
		}
	}

	if (!dev) {
		prlog(PR_ERR, "NPU: couldn't find device with PE%llx\n", pe_number);
		return OPAL_PARAMETER;
	}

	/* TODO: extend this to conform to OPAL injection standards */
	if (type > 1) {
		prlog(PR_ERR, "NPU: invalid error injection type\n");
		return OPAL_PARAMETER;
	} else if (type == 1) {
		/* Emulate fence mode. */
		npu_set_fence_state(p, true);
	} else {
		/* Cause a freeze with an invalid MMIO read. If the BAR is not
		 * enabled, this will checkstop the machine.
		 */
		npu_dev_bar_update(p->chip_id, &dev->bar, true);
		in_be64((void *)dev->bar.base);
	}

	return OPAL_SUCCESS;
}
/* PHB callbacks for the emulated NPU PHB.  Operations that make no
 * sense for an emulated device are left NULL so the generic PCI code
 * falls back or rejects them. */
static const struct phb_ops npu_ops = {
	.cfg_read8		= npu_cfg_read8,
	.cfg_read16		= npu_cfg_read16,
	.cfg_read32		= npu_cfg_read32,
	.cfg_write8		= npu_cfg_write8,
	.cfg_write16		= npu_cfg_write16,
	.cfg_write32		= npu_cfg_write32,
	.get_reserved_pe_number	= NULL,
	.device_init		= NULL,
	.phb_final_fixup	= npu_phb_final_fixup,
	.ioda_reset		= npu_ioda_reset,
	.papr_errinjct_reset	= NULL,
	.pci_reinit		= NULL,
	.set_phb_mem_window	= NULL,
	.phb_mmio_enable	= NULL,
	.map_pe_mmio_window	= NULL,
	.map_pe_dma_window	= npu_map_pe_dma_window,
	.map_pe_dma_window_real	= npu_map_pe_dma_window_real,
	.pci_msi_eoi		= NULL,
	.set_xive_pe		= NULL,
	.get_msi_32		= NULL,
	.get_msi_64		= NULL,
	.set_pe			= npu_set_pe,
	.set_peltv		= NULL,
	.eeh_freeze_status	= npu_freeze_status,
	.eeh_freeze_clear	= NULL,
	.eeh_freeze_set		= NULL,
	.next_error		= npu_eeh_next_error,
	.err_inject		= npu_err_inject,
	.get_diag_data2		= NULL,
	.set_capi_mode		= NULL,
	.set_capp_recovery	= NULL,
};
/* Lay out the NPU MMIO BARs inside the memory window the platform
 * assigned to the NPU.  Returns the AT BAR (base/size) in @at_bar and
 * the kernel-visible DLTL window (base/size) in @mm_win. */
static void assign_mmio_bars(uint32_t gcid, uint32_t xscom,
			     struct dt_node *npu_dn, uint64_t mm_win[2],
			     uint64_t at_bar[2])
{
	uint64_t mem_start, mem_end;
	struct npu_dev_bar bar;
	struct dt_node *link;

	/* Configure BAR selection.
	 *
	 * Currently, each PHY contains 2 links and each link has 2
	 * BARs. The first BAR is assigned to the DLTL region which is
	 * what the kernel uses. The second BAR is either assigned to
	 * either the PL or AT region or unassigned. The PL0/PL1/AT
	 * MMIO regions are not exposed to the kernel so we assigned
	 * them at the start of the available memory area followed by
	 * the DLTL regions. So we end up with the following memory
	 * map (assuming we're given a memory region starting at
	 * 0x3fff000000000):
	 *
	 * Link#0-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000420000
	 * Link#0-BAR#1: PL0 BAR      ( 2MB) - 0x3fff000000000
	 * Link#1-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000440000
	 * Link#1-BAR#1: AT BAR      ( 64KB) - 0x3fff000400000
	 * Link#2-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000460000
	 * Link#2-BAR#1: PL1 BAR      ( 2MB) - 0x3fff000200000
	 * Link#3-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000480000
	 * Link#3-BAR#1: UNASSIGNED
	 */
	xscom_write(gcid, xscom + NPU_AT_SCOM_OFFSET + NX_BAR,
		    0x0211000043500000UL);

	/* Derive the window limits from the pre-assigned BAR#0 of the
	 * first and last links — assumes link 0 holds the lowest and
	 * link 5 the highest address; TODO confirm against platform
	 * BAR assignment. */
	xscom_read(gcid, npu_link_scom_base(npu_dn, xscom, 0) + NX_MMIO_BAR_0,
		   &mem_start);
	mem_start = GETFIELD(NX_MMIO_BAR_BASE, mem_start) << 12;

	xscom_read(gcid, npu_link_scom_base(npu_dn, xscom, 5) + NX_MMIO_BAR_0,
		   &mem_end);
	mem_end = (GETFIELD(NX_MMIO_BAR_BASE, mem_end) << 12) +
		get_bar_size(mem_end);

	/* PL0 BAR comes first at 0x3fff000000000 */
	bar.xscom = npu_link_scom_base(npu_dn, xscom, 0) + NX_MMIO_BAR_1;
	bar.base = mem_start;
	bar.size = NX_MMIO_PL_SIZE;
	npu_dev_bar_update(gcid, &bar, true);

	/* PL1 BAR */
	bar.xscom = npu_link_scom_base(npu_dn, xscom, 4) + NX_MMIO_BAR_1;
	bar.base += bar.size;
	bar.size = NX_MMIO_PL_SIZE;
	npu_dev_bar_update(gcid, &bar, true);

	/* Then the AT BAR */
	bar.xscom = npu_link_scom_base(npu_dn, xscom, 1) + NX_MMIO_BAR_1;
	bar.base += bar.size;
	bar.size = NX_MMIO_AT_SIZE;
	at_bar[0] = bar.base;
	at_bar[1] = NX_MMIO_AT_SIZE;
	npu_dev_bar_update(gcid, &bar, true);

	/* Now we configure all the DLTL BARs. These are the ones
	 * actually exposed to the kernel. */
	mm_win[0] = bar.base + bar.size;
	dt_for_each_node(npu_dn, link) {
		uint32_t index;

		index = dt_prop_get_u32(link, "ibm,npu-link-index");
		bar.xscom = npu_link_scom_base(npu_dn, xscom, index) +
			NX_MMIO_BAR_0;
		bar.base += bar.size;
		bar.size = NX_MMIO_DL_SIZE;
		bar.base = ALIGN_UP(bar.base, bar.size);
		npu_dev_bar_update(gcid, &bar, false);
	}
	mm_win[1] = (bar.base + bar.size) - mm_win[0];

	/* If we weren't given enough room to setup all the BARs we
	 * require it's better to crash here than risk creating
	 * overlapping BARs which will xstop the machine randomly in
	 * the future.*/
	assert(bar.base + bar.size <= mem_end);
}
/* Probe NPU device node and create PCI root device node
* accordingly. The NPU deivce node should specify number
* of links and xscom base address to access links.
*/
/* Probe NPU device node and create PCI root device node
 * accordingly. The NPU device node should specify number
 * of links and xscom base address to access links.
 */
static void npu_probe_phb(struct dt_node *dn)
{
	struct dt_node *np;
	uint32_t gcid, index, phb_index, xscom;
	uint64_t at_bar[2], mm_win[2];
	uint32_t links;
	char *path;

	/* Retrieve chip id */
	path = dt_get_path(dn);
	gcid = dt_get_chip_id(dn);
	index = dt_prop_get_u32(dn, "ibm,npu-index");
	phb_index = dt_prop_get_u32(dn, "ibm,phb-index");
	links = dt_prop_get_u32(dn, "ibm,npu-links");
	prlog(PR_INFO, "Chip %d Found NPU%d (%d links) at %s\n",
	      gcid, index, links, path);
	free(path);

	/* Retrieve xscom base addr */
	xscom = dt_get_address(dn, 0, NULL);
	prlog(PR_INFO, "   XSCOM Base:  %08x\n", xscom);

	/* Carve up the assigned MMIO window into PL/AT/DLTL BARs */
	assign_mmio_bars(gcid, xscom, dn, mm_win, at_bar);
	prlog(PR_INFO, "   AT BAR:      %016llx (%lldKB)\n",
	      at_bar[0], at_bar[1] / 0x400);

	/* Create PCI root device node */
	np = dt_new_addr(dt_root, "pciex", at_bar[0]);
	assert(np);

	dt_add_property_strings(np, "compatible",
				"ibm,power8-npu-pciex", "ibm,ioda2-npu-phb");
	dt_add_property_strings(np, "device_type", "pciex");
	dt_add_property(np, "reg", at_bar, sizeof(at_bar));

	dt_add_property_cells(np, "ibm,phb-index", phb_index);
	dt_add_property_cells(np, "ibm,npu-index", index);
	dt_add_property_cells(np, "ibm,chip-id", gcid);
	dt_add_property_cells(np, "ibm,xscom-base", xscom);
	dt_add_property_cells(np, "ibm,npcq", dn->phandle);
	dt_add_property_cells(np, "ibm,links", links);
	dt_add_property(np, "ibm,mmio-window", mm_win, sizeof(mm_win));
	dt_add_property_cells(np, "ibm,phb-diag-data-size", 0);

	/* Disable fast reboot - not currently supported */
	disable_fast_reboot("NVLink device enabled");
}
/* Fill in the emulated vendor-specific capability: length/version
 * header, the AT/PL procedure register pair (trapped), and the link
 * index byte. */
static void npu_dev_populate_vendor_cap(struct npu_dev_cap *cap)
{
	struct npu_dev *dev = cap->dev;
	struct pci_virt_device *pvd = dev->pvd;
	uint32_t offset = cap->start;
	uint8_t val;

	/* Add length and version information */
	val = cap->end - cap->start;
	PCI_VIRT_CFG_INIT_RO(pvd, offset + 2, 1, val);
	PCI_VIRT_CFG_INIT_RO(pvd, offset + 3, 1, OPAL_NPU_VERSION);
	offset += 4;

	/* Defaults when the trap can't handle the read/write (eg. due
	 * to reading/writing less than 4 bytes). */
	val = 0x0;
	PCI_VIRT_CFG_INIT_RO(pvd, offset, 4, val);
	PCI_VIRT_CFG_INIT_RO(pvd, offset + 4, 4, val);

	/* Create a trap for AT/PL procedures */
	pci_virt_add_filter(pvd, offset, 8,
			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
			    npu_dev_procedure, NULL);
	offset += 8;

	/* Which link this emulated device represents */
	PCI_VIRT_CFG_INIT_RO(pvd, offset, 1, dev->index);
}
/*
 * Populate the PCI Express capability of an emulated NPU device.
 * The offsets in the comments below are relative to the capability
 * base (cap->start).  For the read-write registers the second mask
 * argument of PCI_VIRT_CFG_INIT is the read-only bit mask and the
 * third is the write-1-to-clear mask.
 */
static void npu_dev_populate_pcie_cap(struct npu_dev_cap *cap)
{
	struct npu_dev *dev = cap->dev;
	struct pci_virt_device *pvd = dev->pvd;
	uint32_t base = cap->start;
	uint32_t val;

	/* Sanity check on capability ID */
	if (cap->id != PCI_CFG_CAP_ID_EXP) {
		prlog(PR_NOTICE, "%s: Invalid capability ID %d (%d)\n",
		      __func__, cap->id, PCI_CFG_CAP_ID_EXP);
		return;
	}

	/* Sanity check on spanned registers */
	if ((cap->end - cap->start) < PCIE_CAP_START) {
		prlog(PR_NOTICE, "%s: Invalid reg region [%x, %x] for cap %d\n",
		      __func__, cap->start, cap->end, cap->id);
		return;
	}

	/* 0x00 - ID/PCIE capability: capability version 2, device
	 * type endpoint */
	val = cap->id;
	val |= ((0x2 << 16) | (PCIE_TYPE_ENDPOINT << 20));
	PCI_VIRT_CFG_INIT_RO(pvd, base, 4, val);

	/* 0x04 - Device capability
	 *
	 * We should support FLR. Otherwise, it might have
	 * problem passing it through to userland via Linux
	 * VFIO infrastructure
	 */
	val = ((PCIE_MPSS_128) |
	       (PCIE_PHANTOM_NONE << 3) |
	       (PCIE_L0SL_MAX_NO_LIMIT << 6) |
	       (PCIE_L1L_MAX_NO_LIMIT << 9) |
	       (PCICAP_EXP_DEVCAP_FUNC_RESET));
	PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_DEVCAP, 4, val);

	/* Trap 2-byte writes to Device Control; handled by
	 * npu_dev_cfg_exp_devcap (presumably to act on function
	 * reset requests — handler not visible in this chunk) */
	pci_virt_add_filter(pvd, base + PCICAP_EXP_DEVCTL, 2,
			    PCI_REG_FLAG_WRITE,
			    npu_dev_cfg_exp_devcap, NULL);

	/* 0x08 - Device control and status */
	PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_DEVCTL, 4, 0x00002810,
			  0xffff0000, 0x000f0000);

	/* 0x0c - Link capability */
	val = (PCIE_LSPEED_VECBIT_2 | (PCIE_LWIDTH_1X << 4));
	PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_LCAP, 4, val);

	/* 0x10 - Link control and status */
	PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_LCTL, 4, 0x00130000,
			  0xfffff000, 0xc0000000);

	/* 0x14 - Slot capability */
	PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SLOTCAP, 4, 0x00000000);

	/* 0x18 - Slot control and status */
	PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SLOTCTL, 4, 0x00000000);

	/* 0x1c - Root control and capability */
	PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_RC, 4, 0x00000000,
			  0xffffffe0, 0x00000000);

	/* 0x20 - Root status */
	PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_RSTAT, 4, 0x00000000,
			  0xffffffff, 0x00010000);

	/* 0x24 - Device capability 2
	 *
	 * NOTE(review): macro is spelled PCIECAP_ here while every
	 * other register uses the PCICAP_ prefix — both presumably
	 * come from pci-cfg.h; confirm this is intentional. */
	PCI_VIRT_CFG_INIT_RO(pvd, base + PCIECAP_EXP_DCAP2, 4, 0x00000000);

	/* 0x28 - Device Control and status 2 */
	PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_DCTL2, 4, 0x00070000,
			  0xffff0000, 0x00000000);

	/* 0x2c - Link capability 2 */
	PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_LCAP2, 4, 0x00000007);

	/* 0x30 - Link control and status 2 */
	PCI_VIRT_CFG_INIT(pvd, base + PCICAP_EXP_LCTL2, 4, 0x00000003,
			  0xffff0000, 0x00200000);

	/* 0x34 - Slot capability 2 */
	PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SCAP2, 4, 0x00000000);

	/* 0x38 - Slot control and status 2 */
	PCI_VIRT_CFG_INIT_RO(pvd, base + PCICAP_EXP_SCTL2, 4, 0x00000000);
}
/*
 * Look up the capability with the given ID on the device, creating
 * and registering a fresh descriptor when none exists yet.  Returns
 * the (possibly pre-existing) capability.
 */
static struct npu_dev_cap *npu_dev_create_capability(struct npu_dev *dev,
				void (*populate)(struct npu_dev_cap *),
				uint16_t id,
				uint16_t start,
				uint16_t end)
{
	struct npu_dev_cap *cap = npu_dev_find_capability(dev, id);

	/* Nothing to do if it is already registered */
	if (cap)
		return cap;

	cap = zalloc(sizeof(*cap));
	assert(cap);

	/* Fill in the descriptor and queue it on the device */
	cap->dev      = dev;
	cap->id       = id;
	cap->start    = start;
	cap->end      = end;
	cap->populate = populate;
	list_add_tail(&dev->capabilities, &cap->link);

	return cap;
}
/* Return the device's capability with the matching ID, or NULL */
static struct npu_dev_cap *npu_dev_find_capability(struct npu_dev *dev,
						   uint16_t id)
{
	struct npu_dev_cap *c;

	list_for_each(&dev->capabilities, c, link)
		if (c->id == id)
			return c;

	return NULL;
}
/*
 * Register the emulated config-space capabilities for a device.
 * Capabilities must be appended in ascending register-offset order:
 * npu_dev_create_cfg() walks the list in insertion order when it
 * chains the "next capability" pointers.
 */
static void npu_dev_create_capabilities(struct npu_dev *dev)
{
	list_head_init(&dev->capabilities);

	/* PCI Express capability sits at the lowest offset */
	npu_dev_create_capability(dev, npu_dev_populate_pcie_cap,
				  PCI_CFG_CAP_ID_EXP, PCIE_CAP_START,
				  PCIE_CAP_END);

	/* Vendor specific capability follows it */
	npu_dev_create_capability(dev, npu_dev_populate_vendor_cap,
				  PCI_CFG_CAP_ID_VENDOR, VENDOR_CAP_START,
				  VENDOR_CAP_END);
}
/*
 * Construct the emulated PCI config space for one NPU device:
 * standard type-0 header, a single 64-bit non-prefetchable memory
 * BAR for the link's DLTL region, and the capability chain built
 * earlier by npu_dev_create_capabilities().
 */
static void npu_dev_create_cfg(struct npu_dev *dev)
{
	struct pci_virt_device *pvd = dev->pvd;
	struct npu_dev_cap *cap;
	uint32_t offset;
	uint32_t last_cap_offset;

	/* 0x00 - Vendor/Device ID */
	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014);

	/* 0x04 - Command/Status
	 *
	 * Create one trap to trace toggling memory BAR enable bit
	 */
	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8,
			  0xf9000000);
	pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE,
			    npu_dev_cfg_write_cmd, NULL);

	/* 0x08 - Rev/Class/Cache */
	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800100);

	/* 0x0c - CLS/Latency Timer/Header/BIST */
	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000);

	/* 0x10 - BARs, always 64-bits non-prefetchable
	 *
	 * Each emulated device represents one link and therefore
	 * there is one BAR for the associated DLTL region.
	 */

	/* Low 32-bits */
	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4,
			  (dev->bar.base & 0xfffffff0) | dev->bar.flags,
			  0x0000000f, 0x00000000);

	/* High 32-bits */
	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, (dev->bar.base >> 32),
			  0x00000000, 0x00000000);

	/*
	 * Create trap. Writting 0xFF's to BAR registers should be
	 * trapped and return size on next read
	 */
	pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8,
			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
			    npu_dev_cfg_bar, &dev->bar);

	/* 0x18/1c/20/24 - Disabled BAR#2/3/4/5
	 *
	 * Mark those BARs readonly so that 0x0 will be returned when
	 * probing the length and the BARs will be skipped.
	 */
	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR2, 4, 0x00000000);
	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR3, 4, 0x00000000);
	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000);
	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000);

	/* 0x28 - Cardbus CIS pointer */
	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);

	/* 0x2c - Subsystem ID */
	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);

	/* 0x30 - ROM BAR
	 *
	 * Force its size to be zero so that the kernel will skip
	 * probing the ROM BAR. We needn't emulate ROM BAR.
	 */
	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff);

	/* 0x34 - PCI Capability
	 *
	 * By default, we don't have any capabilities
	 */
	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000);

	/* Chain the registered capabilities.  Starting last_cap_offset
	 * at PCI_CFG_CAP - 1 makes the first iteration write its
	 * "next" pointer at PCI_CFG_CAP itself; every later iteration
	 * patches the next-pointer byte (offset + 1) of the previous
	 * capability header. */
	last_cap_offset = PCI_CFG_CAP - 1;
	list_for_each(&dev->capabilities, cap, link) {
		offset = cap->start;

		/* Initialize config space for the capability */
		if (cap->populate)
			cap->populate(cap);

		/* Add capability header */
		PCI_VIRT_CFG_INIT_RO(pvd, offset, 2, cap->id);

		/* Update the next capability pointer */
		PCI_VIRT_CFG_NORMAL_WR(pvd, last_cap_offset + 1, 1, offset);

		last_cap_offset = offset;
	}

	/* 0x38 - Reserved */
	PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000);

	/* 0x3c - INT line/pin/Minimal grant/Maximal latency
	 *
	 * Even-indexed links report INT#A (pin 1), odd-indexed links
	 * INT#B (pin 2), matching the interrupt-map emitted by
	 * npu_add_phb_properties().
	 */
	if (!(dev->index % 2))
		PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100);
	else
		PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000200);
}
/*
 * Allocate the next free bdfn within the device number given by
 * @group: the function number is the count of devices already
 * allocated on that device number.
 */
static uint32_t npu_allocate_bdfn(struct npu *p, uint32_t group)
{
	uint32_t bdfn = group << 3;
	int i;

	/* Bump the function number once per existing device whose
	 * bus/device bits match ours */
	for (i = 0; i < p->total_devices; i++)
		if ((p->devices[i].pvd->bdfn & 0xf8) == (bdfn & 0xf8))
			bdfn++;

	return bdfn;
}
/*
 * Expand the link@x nodes under the NPU xscom node into emulated
 * PCI devices on the NPU PHB: assign bdfns, program each link's
 * BUID/ISRN register, read back the DLTL BAR from hardware and
 * build the emulated config space.
 */
static void npu_create_devices(struct dt_node *dn, struct npu *p)
{
	struct npu_dev *dev;
	struct dt_node *npu_dn, *link;
	uint32_t bdfn, npu_phandle, index = 0;
	uint64_t buid_reg;
	uint64_t lsisrcid;
	uint64_t buid;

	/* The bits in the LSI ID Base register are always compared and
	 * can be set to 0 in the buid base and mask fields. The
	 * buid (bus unit id) is the full irq minus the last 4 bits. */
	lsisrcid = GETFIELD(NPU_LSI_SRC_ID_BASE, NPU_LSI_SRC_ID_BASE);
	buid = p8_chip_irq_block_base(p->chip_id, P8_IRQ_BLOCK_MISC) >> 4;

	/* Assemble the BUID register: enable bit plus mask/base with
	 * the LSI source ID bits excluded from both fields */
	buid_reg = SETFIELD(NP_IRQ_LEVELS, NP_BUID_ENABLE, ~0);
	buid_reg = SETFIELD(NP_BUID_MASK, buid_reg, ~lsisrcid);
	buid_reg = SETFIELD(NP_BUID_BASE, buid_reg, (buid & ~lsisrcid));

	/* Get the npu node which has the links which we expand here
	 * into pci like devices attached to our emulated phb. */
	npu_phandle = dt_prop_get_u32(dn, "ibm,npcq");
	npu_dn = dt_find_by_phandle(dt_root, npu_phandle);
	assert(npu_dn);

	/* Walk the link@x nodes to initialize devices */
	p->total_devices = 0;
	p->phb.scan_map = 0;
	list_head_init(&p->phb.virt_devices);
	dt_for_each_compatible(npu_dn, link, "ibm,npu-link") {
		struct npu_dev_bar *bar;
		uint32_t group_id;
		uint64_t val;

		dev = &p->devices[index];
		dev->index = dt_prop_get_u32(link, "ibm,npu-link-index");
		dev->xscom = npu_link_scom_base(npu_dn, p->xscom_base,
						dev->index);
		dev->npu = p;
		dev->dt_node = link;

		/* We don't support MMIO PHY access yet */
		dev->pl_base = NULL;

		group_id = dt_prop_get_u32(link, "ibm,npu-group-id");
		bdfn = npu_allocate_bdfn(p, group_id);

		/* This must be done after calling
		 * npu_allocate_bdfn() */
		p->total_devices++;
		p->phb.scan_map |= 0x1 << ((bdfn & 0xf8) >> 3);

		dev->pl_xscom_base = dt_prop_get_u64(link, "ibm,npu-phy");
		dev->lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");

		/* Setup BUID/ISRN */
		xscom_write(p->chip_id, dev->xscom + NX_NP_BUID, buid_reg);

		/* Create PCI virtual device */
		dev->pvd = pci_virt_add_device(&p->phb, bdfn, NPU_DEV_CFG_SIZE, dev);
		assert(dev->pvd);

		/* Update BAR info from hardware; the base field is
		 * shifted up 12 bits, i.e. held in 4KB units */
		bar = &dev->bar;
		bar->flags = (PCI_CFG_BAR_TYPE_MEM |
			      PCI_CFG_BAR_MEM64);

		bar->xscom = dev->xscom + NX_MMIO_BAR_0;
		xscom_read(p->chip_id, bar->xscom, &val);
		bar->base = GETFIELD(NX_MMIO_BAR_BASE, val) << 12;
		bar->size = get_bar_size(val);

		/*
		 * The config space is initialised with the BARs
		 * disabled, so make sure it is actually disabled in
		 * hardware.
		 */
		npu_dev_bar_update(p->chip_id, bar, false);

		/* Initialize capabilities */
		npu_dev_create_capabilities(dev);

		/* Initialize config space */
		npu_dev_create_cfg(dev);

		index++;
	}
}
/*
 * Add the NPU PHB device-tree properties that hostboot does not
 * provide: addressing geometry, the DL LSI interrupt map, OPAL
 * PE/TCE details and the MMIO ranges window.
 */
static void npu_add_phb_properties(struct npu *p)
{
	struct dt_node *np = p->phb.dt_node;
	uint32_t icsp = get_ics_phandle();
	uint64_t tkill, mm_base, mm_size;
	uint32_t base_lsi = p->base_lsi;
	/* Static routing of the two devices' INT#A/INT#B pins onto
	 * the four DL LSI sources */
	uint32_t map[] = {
		/* Dev 0 INT#A (used by fn0) */
		0x0000, 0x0, 0x0, 0x1, icsp, base_lsi + NPU_LSI_INT_DL0, 1,
		/* Dev 0 INT#B (used by fn1) */
		0x0000, 0x0, 0x0, 0x2, icsp, base_lsi + NPU_LSI_INT_DL1, 1,
		/* Dev 1 INT#A (used by fn0) */
		0x0800, 0x0, 0x0, 0x1, icsp, base_lsi + NPU_LSI_INT_DL2, 1,
		/* Dev 1 INT#B (used by fn1) */
		0x0800, 0x0, 0x0, 0x2, icsp, base_lsi + NPU_LSI_INT_DL3, 1,
	};
	/* Mask is bus, device and INT# */
	uint32_t mask[] = {0xf800, 0x0, 0x0, 0x7};
	char slotbuf[32];

	/* Add various properties that HB doesn't have to
	 * add, some of them simply because they result from
	 * policy decisions made in skiboot rather than in HB
	 * such as the MMIO windows going to PCI, interrupts,
	 * etc.
	 */
	dt_add_property_cells(np, "#address-cells", 3);
	dt_add_property_cells(np, "#size-cells", 2);
	dt_add_property_cells(np, "#interrupt-cells", 1);
	dt_add_property_cells(np, "bus-range", 0, 0xff);
	dt_add_property_cells(np, "clock-frequency", 0x200, 0);
	dt_add_property_cells(np, "interrupt-parent", icsp);

	/* DLPL Interrupts, we don't use the standard swizzle */
	p->phb.lstate.int_size = 0;
	dt_add_property(np, "interrupt-map", map, sizeof(map));
	dt_add_property(np, "interrupt-map-mask", mask, sizeof(mask));

	/* NPU PHB properties */
	/* TODO: Due to an errata TCE KILL only works when DMA traffic
	 * has been stopped. We need to implement the work around
	 * which is to do a TCE kill all instead. */
	tkill = cleanup_addr((uint64_t)p->at_regs) + NPU_TCE_KILL;
	dt_add_property_cells(np, "ibm,opal-num-pes",
			      NPU_NUM_OF_PES);
	dt_add_property_cells(np, "ibm,opal-reserved-pe",
			      0);
	dt_add_property_u64(np, "ibm,opal-tce-kill", tkill);

	/* Memory window is exposed as 32-bits non-prefetchable
	 * one because 64-bits prefetchable one is kind of special
	 * to kernel.
	 */
	mm_base = p->mm_base;
	mm_size = p->mm_size;
	dt_add_property_cells(np, "ranges", 0x02000000,
			      hi32(mm_base), lo32(mm_base),
			      hi32(mm_base), lo32(mm_base),
			      hi32(mm_size), lo32(mm_size));

	/* Set the slot location on the NPU PHB. This PHB can contain
	 * devices that correlate with multiple physical slots, so
	 * present the chip ID instead.
	 */
	snprintf(slotbuf, sizeof(slotbuf), "NPU Chip %d", p->chip_id);
	dt_add_property_string(np, "ibm,io-base-loc-code", slotbuf);
}
/*
 * Instantiate one NPU PHB from an "ibm,power8-npu-pciex" node:
 * allocate the npu structure (with the per-link device array in
 * the same allocation), fill it from the device-tree, create the
 * emulated devices, register the PHB and initialize the hardware.
 */
static void npu_create_phb(struct dt_node *dn)
{
	const struct dt_property *prop;
	struct npu *p;
	struct pci_slot *slot;
	uint32_t links;
	void *pmem;

	/* Retrieve number of devices */
	links = dt_prop_get_u32(dn, "ibm,links");
	pmem = zalloc(sizeof(struct npu) + links * sizeof(struct npu_dev));
	assert(pmem);

	/* Populate PHB */
	p = pmem;
	p->index = dt_prop_get_u32(dn, "ibm,npu-index");
	p->chip_id = dt_prop_get_u32(dn, "ibm,chip-id");
	p->xscom_base = dt_prop_get_u32(dn, "ibm,xscom-base");
	p->total_devices = links;

	/* TODO: When hardware fences are implemented, detect them here */
	p->fenced = false;

	/* This is the AT base */
	p->at_xscom = p->xscom_base + NPU_AT_SCOM_OFFSET;
	p->at_regs = (void *)dt_get_address(dn, 0, NULL);

	/* MMIO window: two 64-bit cells (base then size) */
	prop = dt_require_property(dn, "ibm,mmio-window", -1);
	assert(prop->len >= (2 * sizeof(uint64_t)));
	p->mm_base = ((const uint64_t *)prop->prop)[0];
	p->mm_size = ((const uint64_t *)prop->prop)[1];

	/* Device array sits right after the npu struct (see zalloc
	 * above) */
	p->devices = pmem + sizeof(struct npu);

	/* Interrupt */
	p->base_lsi = p8_chip_irq_block_base(p->chip_id, P8_IRQ_BLOCK_MISC) +
		      NPU_LSI_IRQ_MIN;

	/* Generic PHB */
	p->phb.dt_node = dn;
	p->phb.ops = &npu_ops;
	p->phb.phb_type = phb_type_pcie_v3;

	/* Populate devices */
	npu_create_devices(dn, p);

	/* Populate extra properties */
	npu_add_phb_properties(p);

	/* Create PHB slot; failure is logged but not fatal */
	slot = npu_slot_create(&p->phb);
	if (!slot)
	{
		/**
		 * @fwts-label NPUCannotCreatePHBSlot
		 * @fwts-advice Firmware probably ran out of memory creating
		 * NPU slot. NVLink functionality could be broken.
		 */
		prlog(PR_ERR, "NPU: Cannot create PHB slot\n");
	}

	/* Register PHB */
	pci_register_phb(&p->phb, OPAL_DYNAMIC_PHB_ID);

	/* Initialize IODA cache */
	npu_ioda_init(p);

	/* Register interrupt source */
	npu_register_irq(p);

	/* Initialize hardware */
	npu_hw_init(p);
}
/*
 * Entry point for NPU discovery: the first pass probes the raw NPU
 * xscom nodes (which creates the emulated pciex nodes), the second
 * pass turns each of those new nodes into a PHB.
 */
void probe_npu(void)
{
	struct dt_node *np;

	/* Pass 1: scan NPU XSCOM nodes */
	dt_for_each_compatible(dt_root, np, "ibm,power8-npu")
		npu_probe_phb(np);

	/* Pass 2: scan the PHB nodes created by pass 1 */
	dt_for_each_compatible(dt_root, np, "ibm,power8-npu-pciex")
		npu_create_phb(np);
}