// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
/*
* Copyright 2019 IBM Corp.
*/
#include <skiboot.h>
#include <device.h>
#include <phys-map.h>
#include <npu3.h>
#include <npu3-regs.h>
#include <pci-virt.h>
#include <xscom.h>
#include <xscom-p9-regs.h>
#include <interrupts.h>
#include <pci-cfg.h>
#include <pci-slot.h>
#include <cache-p9.h>
#define NPU3LOG(l, npu, fmt, a...) \
prlog(l, "NPU#%04x[%d:%d]: " fmt, \
(npu)->nvlink.phb.opal_id, \
(npu)->chip_id, \
(npu)->index, ##a)
#define NPU3DBG(npu, fmt, a...) NPU3LOG(PR_DEBUG, npu, fmt, ##a)
#define NPU3INF(npu, fmt, a...) NPU3LOG(PR_INFO, npu, fmt, ##a)
#define NPU3ERR(npu, fmt, a...) NPU3LOG(PR_ERR, npu, fmt, ##a)
#define NPU3DEVLOG(l, dev, fmt, a...) \
prlog(l, "NPU#%04x:%02x:%02x.%x " fmt, \
(dev)->npu->nvlink.phb.opal_id, \
PCI_BUS_NUM((dev)->nvlink.pvd->bdfn), \
PCI_DEV((dev)->nvlink.pvd->bdfn), \
PCI_FUNC((dev)->nvlink.pvd->bdfn), ##a)
#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a)
#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a)
#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a)
#define NPU3_CFG_READ(size, type) \
static int64_t npu3_cfg_read##size(struct phb *phb, uint32_t bdfn, \
uint32_t offset, type *data) \
{ \
uint32_t val; \
int64_t ret; \
\
ret = pci_virt_cfg_read(phb, bdfn, offset, \
sizeof(*data), &val); \
*data = (type)val; \
return ret; \
}
#define NPU3_CFG_WRITE(size, type) \
static int64_t npu3_cfg_write##size(struct phb *phb, uint32_t bdfn, \
uint32_t offset, type data) \
{ \
uint32_t val = data; \
int64_t ret; \
\
ret = pci_virt_cfg_write(phb, bdfn, offset, \
sizeof(data), val); \
return ret; \
}
NPU3_CFG_READ(8, u8);
NPU3_CFG_READ(16, u16);
NPU3_CFG_READ(32, u32);
NPU3_CFG_WRITE(8, u8);
NPU3_CFG_WRITE(16, u16);
NPU3_CFG_WRITE(32, u32);
static int64_t npu3_eeh_freeze_status(struct phb *phb __unused,
uint64_t pe_num __unused,
uint8_t *freeze_state,
uint16_t *pci_error_type,
uint16_t *severity)
{
/*
	 * FIXME: When this is called by the skiboot PCI config accessor,
	 * the PE number is fixed to 0, which is incorrect. We need to
	 * introduce another PHB callback to translate it. For now, this
	 * keeps the skiboot PCI enumeration going.
*/
*freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
*pci_error_type = OPAL_EEH_NO_ERROR;
if (severity)
*severity = OPAL_EEH_SEV_NO_ERROR;
return OPAL_SUCCESS;
}
/* Number of PEs supported */
#define NPU3_MAX_PE_NUM 16
#define NPU3_RESERVED_PE_NUM 15
static int64_t npu3_ioda_reset(struct phb *phb, bool purge __unused)
{
struct npu3 *npu = npu3_phb_to_npu(phb);
uint64_t val;
val = NPU3_ATS_IODA_ADDR_AUTO_INC;
val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_SEL, val,
NPU3_ATS_IODA_ADDR_TBL_TVT);
npu3_write(npu, NPU3_ATS_IODA_ADDR, val);
for (uint32_t i = 0; i < NPU3_MAX_PE_NUM; i++)
npu3_write(npu, NPU3_ATS_IODA_DATA, 0ull);
return OPAL_SUCCESS;
}
static inline void npu3_ioda_sel(struct npu3 *npu, uint32_t table,
uint32_t index)
{
uint64_t val;
val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_SEL, 0ull, table);
val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_ADDR, val, index);
npu3_write(npu, NPU3_ATS_IODA_ADDR, val);
}
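/*
 * The IODA tables are accessed indirectly: a write to NPU3_ATS_IODA_ADDR
 * (via npu3_ioda_sel(), or with AUTO_INC as in npu3_ioda_reset() above)
 * selects a table and entry, and subsequent accesses to
 * NPU3_ATS_IODA_DATA then read or write the selected entry.
 */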
static int64_t npu3_map_pe_dma_window(struct phb *phb,
uint64_t pe_num,
uint16_t window_id,
uint16_t tce_levels,
uint64_t tce_table_addr,
uint64_t tce_table_size,
uint64_t tce_page_size)
{
struct npu3 *npu = npu3_phb_to_npu(phb);
uint64_t tts_encoded, val;
uint32_t page_size;
/* Each PE has one corresponding TVE */
if (window_id != pe_num || pe_num >= NPU3_MAX_PE_NUM)
return OPAL_PARAMETER;
npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num);
/* TCE table size zero is used to disable the TVE */
if (!tce_table_size) {
npu3_write(npu, NPU3_ATS_IODA_DATA, 0ull);
return OPAL_SUCCESS;
}
/* TCE table size */
if (!is_pow2(tce_table_size) || tce_table_size < 0x1000)
return OPAL_PARAMETER;
tts_encoded = ilog2(tce_table_size) - 11;
if (tts_encoded > 39)
return OPAL_PARAMETER;
val = SETFIELD(NPU3_ATS_IODA_TVT_TABLE_SIZE, 0ull, tts_encoded);
/* Number of levels */
if (tce_levels < 1 || tce_levels > 4)
return OPAL_PARAMETER;
val = SETFIELD(NPU3_ATS_IODA_TVT_TABLE_LEVEL, val, tce_levels - 1);
/* TCE page size */
switch (tce_page_size) {
case 256 << 20:
page_size = 17;
break;
case 16 << 20:
page_size = 13;
break;
case 64 << 10:
page_size = 5;
break;
default:
page_size = 1;
}
val = SETFIELD(NPU3_ATS_IODA_TVT_PAGE_SIZE, val, page_size);
val = SETFIELD(NPU3_ATS_IODA_TVT_XLAT_ADDR, val, tce_table_addr >> 12);
npu3_write(npu, NPU3_ATS_IODA_DATA, val);
return OPAL_SUCCESS;
}
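/*
 * Worked example for the encoding above (hypothetical values): mapping
 * PE 0 to a one-level, 64K TCE table using 64K pages,
 *
 *	npu3_map_pe_dma_window(phb, 0, 0, 1, tbl_addr, 0x10000, 0x10000);
 *
 * stores TABLE_SIZE = ilog2(0x10000) - 11 = 5, TABLE_LEVEL = 0,
 * PAGE_SIZE = 5 and XLAT_ADDR = tbl_addr >> 12 in the PE's TVE.
 */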
static int64_t npu3_map_pe_dma_window_real(struct phb *phb,
uint64_t pe_num,
uint16_t window_id,
uint64_t pci_start_addr __unused,
uint64_t pci_mem_size __unused)
{
struct npu3 *npu = npu3_phb_to_npu(phb);
uint64_t val;
/* Each PE has one corresponding TVE */
if (window_id != pe_num || pe_num >= NPU3_MAX_PE_NUM)
return OPAL_PARAMETER;
if (pci_mem_size) {
/*
* GPUs need to be able to access the MMIO memory space as well.
* On POWER9 this is above the top of RAM, so disable the TVT
* range check, allowing access to all memory addresses.
*/
val = 0;
} else {
/* Disable */
val = PPC_BIT(51);
}
npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num);
npu3_write(npu, NPU3_ATS_IODA_DATA, val);
return OPAL_SUCCESS;
}
static int64_t npu3_next_error(struct phb *phb,
uint64_t *first_frozen_pe,
uint16_t *pci_error_type,
uint16_t *severity)
{
struct npu3 *npu = npu3_phb_to_npu(phb);
uint64_t val;
uint32_t pe_num;
if (!first_frozen_pe || !pci_error_type || !severity)
return OPAL_PARAMETER;
*first_frozen_pe = -1;
*pci_error_type = OPAL_EEH_NO_ERROR;
*severity = OPAL_EEH_SEV_NO_ERROR;
for (pe_num = 0; pe_num < NPU3_MAX_PE_NUM; pe_num++) {
val = npu3_read(npu, NPU3_MISC_PESTB_DATA(pe_num));
if (!GETFIELD(NPU3_MISC_PESTB_DATA_DMA_STOPPED_STATE, val))
continue;
*first_frozen_pe = pe_num;
*pci_error_type = OPAL_EEH_PE_ERROR;
*severity = OPAL_EEH_SEV_PE_ER;
break;
}
return OPAL_SUCCESS;
}
static struct npu3_dev *npu3_bdfn_to_dev(struct npu3 *npu, uint32_t bdfn)
{
struct pci_virt_device *pvd;
/* All emulated devices are attached to root bus */
if (bdfn & ~0xff)
return NULL;
pvd = pci_virt_find_device(&npu->nvlink.phb, bdfn);
if (pvd)
return pvd->data;
return NULL;
}
static int npu3_match_gpu(struct phb *phb __unused, struct pci_device *pd,
void *data)
{
const char *slot = data;
struct dt_node *dn;
char *loc_code;
/* Ignore non-NVIDIA devices */
if (PCI_VENDOR_ID(pd->vdid) != 0x10de)
return 0;
/* Find the PCI device's slot location */
for (dn = pd->dn;
dn && !dt_find_property(dn, "ibm,loc-code");
dn = dn->parent);
if (!dn)
return 0;
loc_code = (char *)dt_prop_get(dn, "ibm,loc-code");
if (streq(loc_code, slot))
return 1;
return 0;
}
static void npu3_dev_find_gpu(struct npu3_dev *dev)
{
const char *slot = dev->nvlink.loc_code;
struct phb *phb;
struct pci_device *gpu;
if (!slot)
return;
for_each_phb(phb) {
gpu = pci_walk_dev(phb, NULL, npu3_match_gpu, (void *)slot);
if (!gpu)
continue;
dev->nvlink.gpu = gpu;
return;
}
NPU3DEVINF(dev, "No PCI device found for slot '%s'\n", slot);
}
#define VENDOR_CAP_START 0x80
#define VENDOR_CAP_LINK_FLAG_OFFSET 0x0d
void npu3_pvd_flag_set(struct npu3_dev *dev, uint8_t flag)
{
uint32_t offset = VENDOR_CAP_START + VENDOR_CAP_LINK_FLAG_OFFSET;
uint32_t flags;
PCI_VIRT_CFG_RDONLY_RD(dev->nvlink.pvd, offset, 1, &flags);
flags |= flag;
PCI_VIRT_CFG_INIT_RO(dev->nvlink.pvd, offset, 1, flags);
}
void npu3_pvd_flag_clear(struct npu3_dev *dev, uint8_t flag)
{
uint32_t offset = VENDOR_CAP_START + VENDOR_CAP_LINK_FLAG_OFFSET;
uint32_t flags;
PCI_VIRT_CFG_RDONLY_RD(dev->nvlink.pvd, offset, 1, &flags);
flags &= ~flag;
PCI_VIRT_CFG_INIT_RO(dev->nvlink.pvd, offset, 1, flags);
}
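/*
 * The link flag byte sits at config offset 0x8d (VENDOR_CAP_START +
 * VENDOR_CAP_LINK_FLAG_OFFSET). Since it lives in read-only config
 * space, updates go through a read-modify-reinit of the backing store
 * rather than a normal config write.
 */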
static struct lock npu3_phandle_lock = LOCK_UNLOCKED;
static void npu3_append_phandle(struct dt_node *dn, const char *name,
uint32_t phandle)
{
struct dt_property *prop;
uint32_t *phandles;
size_t len;
prop = __dt_find_property(dn, name);
if (!prop) {
dt_add_property_cells(dn, name, phandle);
return;
}
/*
	 * Serialize resizing of the property. Assume this is the only
	 * function that ever resizes it, so holding the lock across the
	 * resize and append is sufficient.
*/
lock(&npu3_phandle_lock);
/* Need to append to the property */
len = prop->len + sizeof(*phandles);
dt_resize_property(&prop, len);
phandles = (uint32_t *)prop->prop;
phandles[len / sizeof(*phandles) - 1] = phandle;
unlock(&npu3_phandle_lock);
}
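/*
 * A GPU reachable over several links thus accumulates one NPU device
 * phandle per link in its "ibm,npu" property; npu3_gpu_links() below
 * relies on this to derive the link count.
 */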
static void npu3_dev_fixup_dt(struct npu3_dev *dev)
{
struct pci_device *pd = dev->nvlink.pd;
struct pci_device *gpu = dev->nvlink.gpu;
dt_add_property_cells(pd->dn, "ibm,nvlink", dev->dn->phandle);
dt_add_property_string(pd->dn, "ibm,loc-code", dev->nvlink.loc_code);
if (dev->link_speed != 0xff)
dt_add_property_cells(pd->dn, "ibm,nvlink-speed",
lo32(dev->link_speed));
if (!gpu)
return;
npu3_append_phandle(gpu->dn, "ibm,npu", pd->dn->phandle);
dt_add_property_cells(pd->dn, "ibm,gpu", gpu->dn->phandle);
}
static int64_t npu3_gpu_bridge_sec_bus_reset(void *pdev,
struct pci_cfg_reg_filter *pcrf __unused,
uint32_t offset, uint32_t len,
uint32_t *data, bool write)
{
struct pci_device *pd = pdev;
struct pci_device *gpu;
struct npu3 *npu;
struct npu3_dev *dev;
bool purge = false;
if (!write)
return OPAL_PARAMETER;
if (len != 2 || offset & 1) {
PCIERR(pd->phb, pd->bdfn,
"Unsupported write to bridge control register\n");
return OPAL_PARAMETER;
}
if (!(*data & PCI_CFG_BRCTL_SECONDARY_RESET))
return OPAL_PARTIAL;
gpu = list_top(&pd->children, struct pci_device, link);
if (!gpu)
return OPAL_PARTIAL;
npu3_for_each_nvlink_npu(npu)
npu3_for_each_nvlink_dev(dev, npu)
if (dev->nvlink.gpu == gpu)
if (!npu3_dev_reset(dev))
purge = true;
if (purge)
purge_l2_l3_caches();
return OPAL_PARTIAL;
}
static int npu3_dev_bind(struct phb *phb, struct pci_device *pd,
void *data __unused)
{
struct npu3 *npu = npu3_phb_to_npu(phb);
struct npu3_dev *dev = npu3_bdfn_to_dev(npu, pd->bdfn);
struct pci_device *gpu;
dev->nvlink.pd = pd;
/* The slot label indicates which GPU this link is connected to */
dev->nvlink.loc_code = dt_prop_get_def(dev->dn, "ibm,slot-label", NULL);
if (!dev->nvlink.loc_code) {
/**
* @fwts-label NPUNoPHBSlotLabel
* @fwts-advice No GPU/NPU slot information was found.
* NVLink3 functionality will not work.
*/
NPU3DEVERR(dev, "Cannot find GPU slot information\n");
}
npu3_dev_find_gpu(dev);
npu3_dev_fixup_dt(dev);
gpu = dev->nvlink.gpu;
if (!gpu)
return 0;
/* When a GPU is reset, ensure all of its links are reset too */
if (gpu->parent && gpu->parent->slot)
pci_add_cfg_reg_filter(gpu->parent, PCI_CFG_BRCTL, 2,
PCI_REG_FLAG_WRITE,
npu3_gpu_bridge_sec_bus_reset);
npu3_pvd_flag_set(dev, NPU3_DEV_PCI_LINKED);
return 0;
}
struct npu3 *npu3_next_nvlink_npu(struct npu3 *npu, uint32_t chip_id)
{
uint64_t phb_id = 0;
struct phb *phb;
if (npu)
phb_id = npu->nvlink.phb.opal_id + 1;
for (; (phb = __pci_next_phb_idx(&phb_id));) {
if (phb->phb_type != phb_type_npu_v3)
continue;
npu = npu3_phb_to_npu(phb);
if (npu->chip_id == chip_id || chip_id == NPU3_ANY_CHIP)
return npu;
}
return NULL;
}
static struct npu3 *npu3_last_npu(void)
{
static struct npu3 *last = NULL;
struct npu3 *npu;
if (last)
return last;
npu3_for_each_nvlink_npu(npu)
last = npu;
return last;
}
static uint32_t npu3_gpu_links(struct pci_device *gpu)
{
const struct dt_property *prop;
if (!gpu)
return 0;
/* The link count is the number of phandles in "ibm,npu" */
prop = dt_find_property(gpu->dn, "ibm,npu");
if (!prop)
return 0;
return prop->len / sizeof(uint32_t);
}
static uint32_t npu3_links_per_gpu(void)
{
struct npu3 *npu;
struct npu3_dev *dev;
uint32_t links = 0;
/* Use the first GPU we find to figure this out */
npu3_for_each_nvlink_npu(npu) {
npu3_for_each_nvlink_dev(dev, npu) {
links = npu3_gpu_links(dev->nvlink.gpu);
if (links)
goto out;
}
}
out:
prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, links);
return links;
}
int32_t npu3_dev_gpu_index(struct npu3_dev *dev)
{
const char *slot;
char *p = NULL;
int ret;
slot = dev->nvlink.loc_code;
if (!slot)
return -1;
if (memcmp(slot, "GPU", 3))
return -1;
ret = strtol(slot + 3, &p, 10);
if (*p || p == slot + 3)
return -1;
return ret;
}
static uint32_t npu3_chip_possible_gpu_links(void)
{
struct proc_chip *chip;
struct npu3 *npu;
struct npu3_dev *dev;
uint32_t possible = 0;
for_each_chip(chip) {
npu3_for_each_chip_nvlink_npu(npu, chip->id)
npu3_for_each_nvlink_dev(dev, npu)
if (npu3_dev_gpu_index(dev) != -1)
possible++;
if (possible)
break;
}
prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, possible);
return possible;
}
uint32_t npu3_chip_possible_gpus(void)
{
static uint32_t possible = -1;
uint32_t links_per_gpu;
/* Static value, same for all chips; only do this once */
if (possible != -1)
return possible;
possible = 0;
links_per_gpu = npu3_links_per_gpu();
if (links_per_gpu)
possible = npu3_chip_possible_gpu_links() / links_per_gpu;
prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, possible);
return possible;
}
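/*
 * Example with hypothetical counts: a chip exposing 6 GPU-attached
 * links to GPUs that each use 3 links can host 6 / 3 = 2 possible
 * GPUs. The real numbers come from the slot labels and the GPUs'
 * "ibm,npu" properties.
 */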
static void npu3_dev_assign_gmb(struct npu3_dev *dev, uint64_t addr,
uint64_t size)
{
uint32_t mode;
uint64_t val;
switch (npu3_gpu_links(dev->nvlink.gpu)) {
case 0:
return;
case 1:
mode = 0;
break;
case 2:
mode = 1;
break;
case 3:
mode = 3;
break;
case 4:
mode = 6;
break;
case 6:
mode = 10;
break;
default:
/* Hardware does not support this configuration */
assert(0);
}
mode += PCI_FUNC(dev->nvlink.pvd->bdfn);
val = NPU3_GPU_MEM_BAR_ENABLE |
NPU3_GPU_MEM_BAR_POISON;
val = SETFIELD(NPU3_GPU_MEM_BAR_ADDR, val, addr >> 30);
val = SETFIELD(NPU3_GPU_MEM_BAR_SIZE, val, size >> 30);
val = SETFIELD(NPU3_GPU_MEM_BAR_MODE, val, mode);
npu3_write(dev->npu, NPU3_GPU_MEM_BAR(dev->index), val);
}
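/*
 * The GPU memory BAR mode above is derived from how many links the GPU
 * has: base mode 0, 1, 3, 6 or 10 for 1, 2, 3, 4 or 6 links
 * respectively, plus the link's function number to select its slot
 * within that group. The BAR itself carries the 1G-aligned address and
 * size (both shifted right by 30 bits).
 */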
static struct dt_node *npu3_create_memory_dn(struct npu3_dev *dev,
uint32_t gpu_index, uint64_t addr,
uint64_t size)
{
uint32_t nid = 255 - gpu_index;
struct dt_node *mem;
mem = dt_find_by_name_addr(dt_root, "memory", addr);
if (mem)
return mem;
mem = dt_new_addr(dt_root, "memory", addr);
assert(mem);
dt_add_property_string(mem, "device_type", "memory");
dt_add_property_string(mem, "compatible", "ibm,coherent-device-memory");
dt_add_property_u64s(mem, "reg", addr, size);
dt_add_property_u64s(mem, "linux,usable-memory", addr, 0);
dt_add_property_cells(mem, "ibm,chip-id", nid);
dt_add_property_cells(mem, "ibm,associativity", 4, nid, nid, nid, nid);
NPU3INF(dev->npu, "%s mem: 0x%016llx (nid %d)\n", dev->nvlink.loc_code,
addr, nid);
return mem;
}
static void npu3_dev_init_gpu_mem(struct npu3_dev *dev)
{
struct pci_device *pd = dev->nvlink.pd;
struct npu3 *npu = dev->npu;
struct dt_node *mem;
uint64_t addr, size, gta;
uint32_t gpu_index;
if (!dev->nvlink.gpu)
return;
gpu_index = npu3_dev_gpu_index(dev) % npu3_chip_possible_gpus();
phys_map_get(npu->chip_id, GPU_MEM_4T_DOWN, gpu_index, &addr, &size);
npu3_dev_assign_gmb(dev, addr, size);
mem = npu3_create_memory_dn(dev, gpu_index, addr, size);
/*
* Coral mode address compression. This is documented in Figure 3.5 of
* the NPU workbook; "P9->GPU RA Compression (Coral)".
*/
	gta = ((addr >> 42) & 0x1) << 42;
	gta |= ((addr >> 45) & 0x3) << 43;
	gta |= ((addr >> 49) & 0x3) << 45;
	gta |= addr & ((1ul << 43) - 1);
dt_add_property_cells(pd->dn, "memory-region", mem->phandle);
dt_add_property_u64s(pd->dn, "ibm,device-tgt-addr", gta);
}
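/*
 * For the compression above: the GPU target address keeps real address
 * bits 0-42 as-is, packs RA bits 45-46 into GTA bits 43-44 and RA bits
 * 49-50 into GTA bits 45-46, dropping the unused bits in between.
 */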
static void npu3_final_fixup(void)
{
struct npu3 *npu;
struct npu3_dev *dev;
npu3_for_each_nvlink_npu(npu)
npu3_for_each_nvlink_dev(dev, npu)
npu3_dev_init_gpu_mem(dev);
}
static void npu3_phb_final_fixup(struct phb *phb)
{
struct npu3 *npu = npu3_phb_to_npu(phb);
pci_walk_dev(phb, NULL, npu3_dev_bind, NULL);
/*
* After every npu's devices are bound, do gpu-related fixup. This
* counts on npu3_last_npu() walking the phbs in the same order as
* the PHB final fixup loop in __pci_init_slots().
*/
if (npu == npu3_last_npu())
npu3_final_fixup();
}
static int64_t npu3_set_pe(struct phb *phb,
uint64_t pe_num,
uint64_t bdfn,
uint8_t bcompare,
uint8_t dcompare,
uint8_t fcompare,
uint8_t action)
{
struct npu3 *npu = npu3_phb_to_npu(phb);
struct npu3_dev *dev;
uint64_t val;
dev = npu3_bdfn_to_dev(npu, bdfn);
if (!dev)
return OPAL_PARAMETER;
if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
return OPAL_PARAMETER;
if (pe_num >= NPU3_MAX_PE_NUM)
return OPAL_PARAMETER;
if (bcompare != OpalPciBusAll ||
dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER ||
fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER)
return OPAL_UNSUPPORTED;
if (!dev->nvlink.gpu)
return OPAL_SUCCESS;
val = NPU3_CTL_BDF2PE_CFG_ENABLE;
val = SETFIELD(NPU3_CTL_BDF2PE_CFG_PE, val, pe_num);
val = SETFIELD(NPU3_CTL_BDF2PE_CFG_BDF, val, dev->nvlink.gpu->bdfn);
npu3_write(npu, NPU3_CTL_BDF2PE_CFG(pe_num), val);
val = NPU3_MISC_BDF2PE_CFG_ENABLE;
val = SETFIELD(NPU3_MISC_BDF2PE_CFG_PE, val, pe_num);
val = SETFIELD(NPU3_MISC_BDF2PE_CFG_BDF, val, dev->nvlink.gpu->bdfn);
npu3_write(npu, NPU3_MISC_BDF2PE_CFG(pe_num), val);
return OPAL_SUCCESS;
}
static int64_t npu3_tce_kill_pages(struct npu3 *npu,
uint64_t pe_num,
uint32_t tce_size,
uint64_t dma_addr,
uint32_t npages)
{
uint32_t check_tce_size;
uint64_t val;
if (pe_num >= NPU3_MAX_PE_NUM)
return OPAL_PARAMETER;
npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num);
val = npu3_read(npu, NPU3_ATS_IODA_DATA);
check_tce_size = 0x800 << GETFIELD(NPU3_ATS_IODA_TVT_PAGE_SIZE, val);
if (check_tce_size != tce_size) {
NPU3ERR(npu, "%s: Unexpected TCE size (got 0x%x, expected 0x%x)\n",
__func__, tce_size, check_tce_size);
return OPAL_PARAMETER;
}
val = NPU3_ATS_TCE_KILL_ONE;
val = SETFIELD(NPU3_ATS_TCE_KILL_PE_NUMBER, val, pe_num);
while (npages--) {
val = SETFIELD(NPU3_ATS_TCE_KILL_ADDRESS, val, dma_addr >> 12);
npu3_write(npu, NPU3_ATS_TCE_KILL, val);
dma_addr += tce_size;
}
return OPAL_SUCCESS;
}
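/*
 * The page size check above reverses the TVE encoding as
 * 0x800 << PAGE_SIZE; e.g. the 64K encoding (5) yields
 * 0x800 << 5 = 0x10000, which must match the tce_size the caller
 * passed in.
 */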
static int64_t npu3_tce_kill(struct phb *phb,
uint32_t kill_type,
uint64_t pe_num,
uint32_t tce_size,
uint64_t dma_addr,
uint32_t npages)
{
struct npu3 *npu = npu3_phb_to_npu(phb);
sync();
	switch (kill_type) {
case OPAL_PCI_TCE_KILL_PAGES:
return npu3_tce_kill_pages(npu, pe_num, tce_size,
dma_addr, npages);
case OPAL_PCI_TCE_KILL_PE:
/*
* NPU doesn't support killing a PE so fall through
* and do a kill all instead.
*/
case OPAL_PCI_TCE_KILL_ALL:
npu3_write(npu, NPU3_ATS_TCE_KILL, NPU3_ATS_TCE_KILL_ALL);
return OPAL_SUCCESS;
}
return OPAL_PARAMETER;
}
static const struct phb_ops npu_ops = {
.cfg_read8 = npu3_cfg_read8,
.cfg_read16 = npu3_cfg_read16,
.cfg_read32 = npu3_cfg_read32,
.cfg_write8 = npu3_cfg_write8,
.cfg_write16 = npu3_cfg_write16,
.cfg_write32 = npu3_cfg_write32,
.eeh_freeze_status = npu3_eeh_freeze_status,
.ioda_reset = npu3_ioda_reset,
.map_pe_dma_window = npu3_map_pe_dma_window,
.map_pe_dma_window_real = npu3_map_pe_dma_window_real,
.next_error = npu3_next_error,
.phb_final_fixup = npu3_phb_final_fixup,
.set_pe = npu3_set_pe,
.tce_kill = npu3_tce_kill,
};
static int64_t npu3_reset(struct pci_slot *slot)
{
struct npu3 *npu = npu3_phb_to_npu(slot->phb);
struct npu3_dev *dev;
int64_t rc = OPAL_SUCCESS;
bool purge = false;
npu3_for_each_nvlink_dev(dev, npu) {
rc = npu3_dev_reset(dev);
if (rc)
break;
purge = true;
}
/* No devices reset; don't purge, just return */
if (!purge)
return rc;
/* All devices reset */
if (!rc)
return purge_l2_l3_caches();
/* Some devices successfully reset; purge, but still return error */
purge_l2_l3_caches();
return rc;
}
static int64_t npu3_freset(struct pci_slot *slot __unused)
{
return OPAL_SUCCESS;
}
static int64_t npu3_get_link_state(struct pci_slot *slot __unused,
uint8_t *val)
{
*val = OPAL_SHPC_LINK_UP_x1;
return OPAL_SUCCESS;
}
static int64_t npu3_get_power_state(struct pci_slot *slot __unused,
uint8_t *val)
{
*val = PCI_SLOT_POWER_ON;
return OPAL_SUCCESS;
}
static void npu3_create_phb_slot(struct npu3 *npu)
{
struct pci_slot *slot;
slot = pci_slot_alloc(&npu->nvlink.phb, NULL);
if (!slot)
return;
/* Elementary functions */
slot->ops.creset = npu3_reset;
slot->ops.freset = npu3_freset;
slot->ops.hreset = npu3_reset;
slot->ops.get_link_state = npu3_get_link_state;
slot->ops.get_power_state = npu3_get_power_state;
}
static void npu3_create_phb(struct npu3 *npu)
{
struct phb *phb = &npu->nvlink.phb;
phb->phb_type = phb_type_npu_v3;
phb->ops = &npu_ops;
phb->dt_node = dt_new_addr(dt_root, "pciex", npu->regs[0]);
assert(phb->dt_node);
list_head_init(&phb->virt_devices);
pci_register_phb(phb, npu3_get_opal_id(npu->chip_id,
npu3_get_phb_index(npu->index)));
npu3_create_phb_slot(npu);
npu3_ioda_reset(phb, true);
}
static void npu3_dev_init_hw(struct npu3_dev *dev)
{
struct npu3 *npu = dev->npu;
uint64_t reg, val;
reg = NPU3_RELAXED_CFG2(dev->index);
val = npu3_read(npu, reg);
val |= NPU3_RELAXED_CFG2_CMD_CL_DMA_W |
NPU3_RELAXED_CFG2_CMD_CL_DMA_W_HP |
NPU3_RELAXED_CFG2_CMD_CL_DMA_INJ |
NPU3_RELAXED_CFG2_CMD_PR_DMA_INJ |
NPU3_RELAXED_CFG2_CMD_DMA_PR_W |
NPU3_RELAXED_CFG2_CMD_CL_RD_NC_F0 |
NPU3_RELAXED_CFG2_SRC_RDENA(0);
npu3_write(npu, reg, val);
reg = NPU3_NTL_MISC_CFG2(dev->index);
val = npu3_read(npu, reg);
val |= NPU3_NTL_MISC_CFG2_BRICK_ENABLE |
NPU3_NTL_MISC_CFG2_RCV_CREDIT_OVERFLOW_ENA;
npu3_write(npu, reg, val);
}
static void npu3_init_hw(struct npu3 *npu)
{
struct npu3_dev *dev;
uint64_t reg, val;
reg = NPU3_XTS_CFG;
val = npu3_read(npu, reg);
val |= NPU3_XTS_CFG_MMIOSD | NPU3_XTS_CFG_TRY_ATR_RO;
npu3_write(npu, reg, val);
reg = NPU3_XTS_CFG2;
val = npu3_read(npu, reg);
val |= NPU3_XTS_CFG2_NO_FLUSH_ENA;
npu3_write(npu, reg, val);
reg = NPU3_RELAXED_SRC(0);
val = NPU3_RELAXED_SRC_MASK_NPU;
npu3_write(npu, reg, val);
npu3_for_each_nvlink_dev(dev, npu)
npu3_dev_init_hw(dev);
}
/* PCI command register (BAR enable/disable) */
static int64_t npu3_cfg_cmd(void *pvd,
struct pci_cfg_reg_filter *pcrf __unused,
uint32_t offset, uint32_t size,
uint32_t *data, bool write)
{
struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
if (!write)
return OPAL_PARTIAL;
if (offset != PCI_CFG_CMD)
return OPAL_PARAMETER;
if (size != 1 && size != 2 && size != 4)
return OPAL_PARAMETER;
npu3_dev_enable_bars(dev, !!(*data & PCI_CFG_CMD_MEM_EN));
return OPAL_PARTIAL;
}
static int64_t npu3_cfg_bar_write(struct npu3_bar *bar, uint64_t mask,
uint32_t data)
{
if (data != 0xffffffff)
return OPAL_HARDWARE;
/* Return BAR size on next read */
bar->trap |= mask;
return OPAL_SUCCESS;
}
static int64_t npu3_cfg_bar_read(struct npu3_bar *bar, uint64_t mask,
uint32_t *data)
{
if (!(bar->trap & mask))
return OPAL_PARTIAL;
*data = GETFIELD(mask, bar->size);
bar->trap &= ~mask;
return OPAL_SUCCESS;
}
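/*
 * Together these two helpers emulate the standard PCI BAR sizing
 * handshake: a write of 0xffffffff arms the trap for that word, the
 * next read returns the BAR size instead of its address, and
 * subsequent reads fall through (OPAL_PARTIAL) to the backing config
 * space again.
 */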
/* PCI BAR registers (NTL/GENID) */
static int64_t npu3_cfg_bar(void *pvd __unused,
struct pci_cfg_reg_filter *pcrf,
uint32_t offset, uint32_t size, uint32_t *data,
bool write)
{
struct npu3_bar *bar = (struct npu3_bar *)pcrf->data;
uint64_t mask;
if (size != 4)
return OPAL_PARAMETER;
if (offset == pcrf->start)
mask = 0xffffffff;
else if (offset == pcrf->start + 4)
mask = 0xffffffffull << 32;
else
return OPAL_PARAMETER;
if (write)
return npu3_cfg_bar_write(bar, mask, *data);
return npu3_cfg_bar_read(bar, mask, data);
}
/* PCI control register */
static int64_t npu3_cfg_devctl(void *pvd,
struct pci_cfg_reg_filter *pcrf __unused,
uint32_t offset, uint32_t size,
uint32_t *data, bool write)
{
struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
if (!write)
return OPAL_HARDWARE;
if (size != 2 || offset & 1) {
NPU3DEVERR(dev, "Unsupported write to pcie control register\n");
return OPAL_PARAMETER;
}
if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
if (!npu3_dev_reset(dev))
purge_l2_l3_caches();
return OPAL_PARTIAL;
}
static uint32_t npu3_cfg_populate_pcie_cap(struct npu3_dev *dev, uint32_t start,
uint32_t prev_cap)
{
struct pci_virt_device *pvd = dev->nvlink.pvd;
uint32_t val;
/* Add capability list */
PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_EXP);
/* 0x00 - ID/PCIE capability */
val = PCI_CFG_CAP_ID_EXP;
val |= 0x2 << 16 | PCIE_TYPE_ENDPOINT << 20;
PCI_VIRT_CFG_INIT_RO(pvd, start, 4, val);
/* 0x04 - Device capability */
val = PCIE_MPSS_128 |
PCIE_PHANTOM_NONE << 3 |
PCIE_L0SL_MAX_NO_LIMIT << 6 |
PCIE_L1L_MAX_NO_LIMIT << 9 |
PCICAP_EXP_DEVCAP_FUNC_RESET;
PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_DEVCAP, 4, val);
pci_virt_add_filter(pvd, start + PCICAP_EXP_DEVCTL, 2,
PCI_REG_FLAG_WRITE,
npu3_cfg_devctl, NULL);
/* 0x08 - Device control and status */
PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DEVCTL, 4, 0x00002810,
0xffff0000, 0x000f0000);
/* 0x0c - Link capability */
val = PCIE_LSPEED_VECBIT_2 | PCIE_LWIDTH_1X << 4;
PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP, 4, val);
/* 0x10 - Link control and status */
PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL, 4, 0x00130000,
0xfffff000, 0xc0000000);
/* 0x14 - Slot capability */
PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCAP, 4, 0x00000000);
/* 0x18 - Slot control and status */
PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCTL, 4, 0x00000000);
/* 0x1c - Root control and capability */
PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RC, 4, 0x00000000,
0xffffffe0, 0x00000000);
/* 0x20 - Root status */
PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RSTAT, 4, 0x00000000,
0xffffffff, 0x00010000);
/* 0x24 - Device capability 2 */
PCI_VIRT_CFG_INIT_RO(pvd, start + PCIECAP_EXP_DCAP2, 4, 0x00000000);
/* 0x28 - Device Control and status 2 */
PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DCTL2, 4, 0x00070000,
0xffff0000, 0x00000000);
/* 0x2c - Link capability 2 */
PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP2, 4, 0x00000007);
/* 0x30 - Link control and status 2 */
PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL2, 4, 0x00000003,
0xffff0000, 0x00200000);
/* 0x34 - Slot capability 2 */
PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCAP2, 4, 0x00000000);
/* 0x38 - Slot control and status 2 */
PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCTL2, 4, 0x00000000);
return start + PCICAP_EXP_SCTL2 + 8;
}
static int64_t npu3_dev_procedure_write(struct npu3_dev *dev, uint32_t offset,
uint32_t data)
{
switch (offset) {
case 0:
NPU3DEVINF(dev, "Ignoring write to status register\n");
break;
case 4:
npu3_dev_procedure_init(dev, data);
break;
default:
return OPAL_PARAMETER;
}
return OPAL_SUCCESS;
}
static int64_t npu3_dev_procedure_read(struct npu3_dev *dev, uint32_t offset,
uint32_t *data)
{
switch (offset) {
case 0:
*data = npu3_dev_procedure_status(dev);
break;
case 4:
*data = dev->proc.number;
break;
default:
*data = 0;
return OPAL_PARAMETER;
}
return OPAL_SUCCESS;
}
/* Hardware procedure control/status registers */
static int64_t npu3_dev_procedure(void *pvd, struct pci_cfg_reg_filter *pcrf,
uint32_t offset, uint32_t size,
uint32_t *data, bool write)
{
struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
if (size != 4)
return OPAL_PARAMETER;
offset -= pcrf->start;
if (write)
return npu3_dev_procedure_write(dev, offset, *data);
return npu3_dev_procedure_read(dev, offset, data);
}
/* PPE SRAM access is indirect via CSAR/CSDR */
static void npu3_dev_ppe_sram_sel(struct npu3_dev *dev, uint32_t reg)
{
uint64_t val;
val = SETFIELD(OB_PPE_CSAR_SRAM_ADDR, 0ull, reg);
xscom_write(dev->npu->chip_id, OB_PPE_CSAR(dev->ob_chiplet), val);
}
static void npu3_dev_ppe_sram_write(struct npu3_dev *dev, uint32_t reg,
uint64_t val)
{
npu3_dev_ppe_sram_sel(dev, reg);
xscom_write(dev->npu->chip_id, OB_PPE_CSDR(dev->ob_chiplet), val);
}
static uint64_t npu3_dev_ppe_sram_read(struct npu3_dev *dev, uint32_t reg)
{
uint64_t val;
npu3_dev_ppe_sram_sel(dev, reg);
xscom_read(dev->npu->chip_id, OB_PPE_CSDR(dev->ob_chiplet), &val);
return val;
}
/* Software-implemented autonomous link training (SALT) */
static int64_t npu3_dev_salt(void *pvd, struct pci_cfg_reg_filter *pcrf,
uint32_t offset, uint32_t size, uint32_t *data,
bool write)
{
struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
unsigned long timeout;
uint32_t cmd_reg;
uint64_t val;
if (size != 4 || offset != pcrf->start)
return OPAL_PARAMETER;
/* The config register before this one holds CMD_REG */
PCI_VIRT_CFG_NORMAL_RD(pvd, pcrf->start - 4, 4, &cmd_reg);
if (cmd_reg == 0xffffffff)
return OPAL_PARAMETER;
/* Check for another command in progress */
val = npu3_dev_ppe_sram_read(dev, OB_PPE_SALT_CMD);
if (GETFIELD(OB_PPE_SALT_CMD_READY, val)) {
NPU3DEVINF(dev, "SALT_CMD 0x%x: Not ready\n", cmd_reg);
return OPAL_BUSY;
}
val = OB_PPE_SALT_CMD_READY;
val = SETFIELD(OB_PPE_SALT_CMD_RW, val, write);
val = SETFIELD(OB_PPE_SALT_CMD_LINKNUM, val, npu3_chip_dev_index(dev));
val = SETFIELD(OB_PPE_SALT_CMD_REG, val, cmd_reg);
if (write)
val = SETFIELD(OB_PPE_SALT_CMD_DATA, val, *data);
npu3_dev_ppe_sram_write(dev, OB_PPE_SALT_CMD, val);
	/* Wait for the PPE to clear the ready (command pending) bit */
timeout = mftb() + msecs_to_tb(1000);
while (GETFIELD(OB_PPE_SALT_CMD_READY, val)) {
if (tb_compare(mftb(), timeout) == TB_AAFTERB) {
NPU3DEVINF(dev, "SALT_CMD 0x%x: Timeout\n", cmd_reg);
return OPAL_BUSY;
}
val = npu3_dev_ppe_sram_read(dev, OB_PPE_SALT_CMD);
}
if (GETFIELD(OB_PPE_SALT_CMD_ERR, val))
NPU3DEVINF(dev, "SALT_CMD 0x%x: Error\n", cmd_reg);
if (!write)
*data = GETFIELD(OB_PPE_SALT_CMD_DATA, val);
return OPAL_SUCCESS;
}
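/*
 * SALT usage sketch, based on the vendor capability layout below: the
 * OS writes a register number to the word at capability offset 0x10
 * (read back here as cmd_reg from pcrf->start - 4), then reads or
 * writes the data word at offset 0x14 to trigger this filter, which
 * marshals the command into OB_PPE_SALT_CMD and polls up to 1s for the
 * PPE to complete it.
 */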
#define VENDOR_CAP_LEN 0x1c
#define VENDOR_CAP_VERSION 0x02
static uint32_t npu3_cfg_populate_vendor_cap(struct npu3_dev *dev,
uint32_t start, uint32_t prev_cap)
{
struct pci_virt_device *pvd = dev->nvlink.pvd;
/* Capabilities list */
PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_VENDOR);
/* Length and version */
PCI_VIRT_CFG_INIT_RO(pvd, start + 2, 1, VENDOR_CAP_LEN);
PCI_VIRT_CFG_INIT_RO(pvd, start + 3, 1, VENDOR_CAP_VERSION);
/*
	 * Defaults when the trap can't handle the read/write (e.g. due to
* reading/writing less than 4 bytes).
*/
PCI_VIRT_CFG_INIT_RO(pvd, start + 4, 4, 0);
PCI_VIRT_CFG_INIT_RO(pvd, start + 8, 4, 0);
/* PHY procedure trap */
pci_virt_add_filter(pvd, start + 4, 8,
PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
npu3_dev_procedure, NULL);
/* Link index */
PCI_VIRT_CFG_INIT_RO(pvd, start + 0xc, 1, npu3_chip_dev_index(dev));
/* SALT registers */
PCI_VIRT_CFG_INIT(pvd, start + 0x10, 4, 0xffffffff, 0, 0);
PCI_VIRT_CFG_INIT_RO(pvd, start + 0x14, 4, 0);
pci_virt_add_filter(pvd, start + 0x14, 4,
PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
npu3_dev_salt, NULL);
return start + VENDOR_CAP_LEN;
}
static void npu3_cfg_populate(struct npu3_dev *dev)
{
struct pci_virt_device *pvd = dev->nvlink.pvd;
uint64_t addr;
uint32_t pos;
/* 0x00 - Vendor/Device ID */
PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014);
/* 0x04 - Command/Status */
PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8,
0xf9000000);
pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE,
npu3_cfg_cmd, NULL);
/* 0x08 - Rev/Class/Cache */
PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800102);
/* 0x0c - CLS/Latency Timer/Header/BIST */
PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000);
/* 0x10/14 - NTL BAR */
addr = SETFIELD(0xf, dev->ntl_bar.addr,
PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64);
PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4, lo32(addr), 0xf, 0);
PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, hi32(addr), 0, 0);
pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8,
PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
npu3_cfg_bar, &dev->ntl_bar);
/* 0x18/1c - GENID BAR */
addr = SETFIELD(0xf, dev->genid_bar.addr,
PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64);
PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, lo32(addr), 0xf, 0);
PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR3, 4, hi32(addr), 0, 0);
pci_virt_add_filter(pvd, PCI_CFG_BAR2, 8,
PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
npu3_cfg_bar, &dev->genid_bar);
/* 0x20/0x24 - BARs, disabled */
PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000);
PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000);
/* 0x28 - Cardbus CIS pointer */
PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);
/* 0x2c - Subsystem ID */
PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);
/* 0x30 - ROM BAR, zero sized */
PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff);
/* 0x34 - PCI Capability */
PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000);
/* 0x38 - Reserved */
PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000);
/* 0x3c - INT line/pin/Minimal grant/Maximal latency */
PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100); /* INT A */
/* PCIE and vendor specific capability */
pos = npu3_cfg_populate_pcie_cap(dev, 0x40, PCI_CFG_CAP);
pos = npu3_cfg_populate_vendor_cap(dev, pos, 0x41);
PCI_VIRT_CFG_INIT_RO(pvd, pos + 1, 1, 0);
}
static void npu3_dev_create_pvd(struct npu3_dev *dev)
{
struct npu3 *npu = dev->npu;
struct phb *phb = &npu->nvlink.phb;
dev->nvlink.pvd = pci_virt_add_device(phb, dev->index, 0x100, dev);
if (!dev->nvlink.pvd)
return;
phb->scan_map |= 0x1 << GETFIELD(0xf8, dev->nvlink.pvd->bdfn);
npu3_cfg_populate(dev);
}
static void npu3_dt_add_mmio_atsd(struct npu3 *npu)
{
struct dt_node *dn = npu->nvlink.phb.dt_node;
uint64_t mmio_atsd[NPU3_XTS_ATSD_MAX];
for (uint32_t i = 0; i < NPU3_XTS_ATSD_MAX; i++)
mmio_atsd[i] = npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(i);
dt_add_property(dn, "ibm,mmio-atsd", mmio_atsd, sizeof(mmio_atsd));
}
static void npu3_dt_add_mmio_window(struct npu3 *npu)
{
struct dt_node *dn = npu->nvlink.phb.dt_node;
uint32_t ntl0_index = npu->index * NPU3_LINKS_PER_NPU;
uint64_t addr, size, win[2];
/* Device MMIO window (NTL/GENID regs only) */
phys_map_get(npu->chip_id, NPU_NTL, ntl0_index, &win[0], NULL);
phys_map_get(npu->chip_id, NPU_GENID, npu->index, &addr, &size);
win[1] = addr + size - win[0];
dt_add_property(dn, "ibm,mmio-window", win, sizeof(win));
dt_add_property_cells(dn, "ranges", 0x02000000,
hi32(win[0]), lo32(win[0]),
hi32(win[0]), lo32(win[0]),
hi32(win[1]), lo32(win[1]));
}
/* NDL No-Stall Event level */
static uint32_t npu3_dev_interrupt_level(struct npu3_dev *dev)
{
const uint32_t level[12] = { 1, 3, 5, 7, 9, 11,
43, 45, 47, 49, 51, 53 };
return level[npu3_chip_dev_index(dev)];
}
static void npu3_dt_add_interrupts(struct npu3 *npu)
{
struct dt_node *dn = npu->nvlink.phb.dt_node;
uint32_t *map, icsp, i = 0;
struct npu3_dev *dev;
size_t map_size = 0;
npu3_for_each_nvlink_dev(dev, npu)
map_size += sizeof(*map) * 7;
if (!map_size)
return;
icsp = get_ics_phandle();
map = zalloc(map_size);
assert(map);
npu3_for_each_nvlink_dev(dev, npu) {
map[i] = dev->nvlink.pvd->bdfn << 8;
map[i + 3] = 1; /* INT A */
map[i + 4] = icsp; /* interrupt-parent */
map[i + 5] = npu->irq_base + npu3_dev_interrupt_level(dev);
map[i + 6] = 0; /* 0 = EDGE, 1 = LEVEL */
i += 7;
}
dt_add_property_cells(dn, "interrupt-parent", icsp);
dt_add_property(dn, "interrupt-map", map, map_size);
dt_add_property_cells(dn, "interrupt-map-mask", 0xff00, 0x0, 0x0, 0x7);
free(map);
}
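/*
 * Each interrupt-map entry above is 7 cells: a 3-cell unit address
 * (bdfn << 8, 0, 0), the 1-cell wire (1 = INT A), the interrupt parent
 * phandle, and a 2-cell parent specifier (level number, 0 = edge).
 */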
/* Populate PCI root device node */
static void npu3_dt_add_props(struct npu3 *npu)
{
struct dt_node *dn = npu->nvlink.phb.dt_node;
dt_add_property_cells(dn, "#address-cells", 3);
dt_add_property_cells(dn, "#size-cells", 2);
dt_add_property_cells(dn, "#interrupt-cells", 1);
dt_add_property_cells(dn, "bus-range", 0, 0xff);
dt_add_property_cells(dn, "clock-frequency", 0x200, 0);
dt_add_property_strings(dn, "device_type", "pciex");
/*
	 * To the OS, npu2 and npu3 are both ibm,ioda2-npu2-phb. The additional
	 * ibm,ioda2-npu3-phb allows for possible quirks.
*/
dt_add_property_strings(dn, "compatible",
"ibm,power9-npu-pciex",
"ibm,ioda2-npu2-phb",
"ibm,ioda2-npu3-phb");
dt_add_property_cells(dn, "ibm,phb-index",
npu3_get_phb_index(npu->index));
dt_add_property_cells(dn, "ibm,phb-diag-data-size", 0);
dt_add_property_cells(dn, "ibm,opal-num-pes", NPU3_MAX_PE_NUM);
dt_add_property_cells(dn, "ibm,opal-reserved-pe", NPU3_RESERVED_PE_NUM);
dt_add_property_cells(dn, "ibm,supported-tce-sizes",
12, /* 4K */
16, /* 64K */
24, /* 16M */
28); /* 256M */
dt_add_property_cells(dn, "ibm,chip-id", npu->chip_id);
dt_add_property_cells(dn, "ibm,npu-index", npu->index);
dt_add_property_cells(dn, "ibm,npcq", npu->dt_node->phandle);
dt_add_property_cells(dn, "ibm,xscom-base", npu->xscom_base);
dt_add_property_cells(dn, "ibm,links", NPU3_LINKS_PER_NPU);
dt_add_property(dn, "reg", npu->regs, sizeof(npu->regs));
npu3_dt_add_mmio_atsd(npu);
npu3_dt_add_mmio_window(npu);
npu3_dt_add_interrupts(npu);
}
void npu3_init_nvlink(struct npu3 *npu)
{
struct npu3_dev *dev;
if (!npu3_next_dev(npu, NULL, NPU3_DEV_TYPE_NVLINK))
return;
npu3_init_hw(npu);
npu3_create_phb(npu);
npu3_for_each_nvlink_dev(dev, npu)
npu3_dev_create_pvd(dev);
npu3_dt_add_props(npu);
/* TODO: Sort out if/why we still can't enable this */
disable_fast_reboot("NVLink device enabled");
}
static int64_t npu3_init_context_pid(struct npu3 *npu, uint32_t index,
uint64_t msr)
{
uint64_t map, old_map;
/* Unfiltered XTS mode; index is lparshort */
map = SETFIELD(NPU3_XTS_PID_MAP_LPARSHORT, 0ull, index);
/* Enable this mapping for both real and virtual addresses */
map |= NPU3_XTS_PID_MAP_VALID_ATRGPA0 | NPU3_XTS_PID_MAP_VALID_ATRGPA1;
/* Enable TLBIE/MMIOSD forwarding for this entry */
map |= NPU3_XTS_PID_MAP_VALID_ATSD;
/* Set the relevant MSR bits */
if (msr & MSR_DR)
map |= NPU3_XTS_PID_MAP_MSR_DR;
if (msr & MSR_HV)
map |= NPU3_XTS_PID_MAP_MSR_HV;
if (msr & MSR_PR)
map |= NPU3_XTS_PID_MAP_MSR_PR;
/* We don't support anything other than 64-bit so hardcode it here */
map |= NPU3_XTS_PID_MAP_MSR_SF;
old_map = npu3_read(npu, NPU3_XTS_PID_MAP(index));
/* Error out if this entry is already set with different msr bits */
if (old_map && GETFIELD(NPU3_XTS_PID_MAP_MSR, old_map) !=
GETFIELD(NPU3_XTS_PID_MAP_MSR, map)) {
NPU3ERR(npu, "%s: Unexpected MSR value\n", __func__);
return OPAL_PARAMETER;
}
if (!old_map) {
NPU3DBG(npu, "XTS_PID_MAP[%03d] = 0x%08llx\n", index, map);
npu3_write(npu, NPU3_XTS_PID_MAP(index), map);
}
npu->nvlink.ctx_ref[index]++;
return OPAL_SUCCESS;
}
#define NPU3_VALID_ATS_MSR_BITS (MSR_DR | MSR_HV | MSR_PR | MSR_SF)
/*
* Allocate a context ID and initialize the tables with the relevant
* information. Returns the ID or error if one couldn't be allocated.
*/
int64_t npu3_init_context(struct phb *phb, uint64_t msr, uint64_t bdf)
{
struct npu3 *npu = npu3_phb_to_npu(phb);
uint32_t lparshort, i;
uint64_t map;
int64_t rc;
/*
* MSR bits should be masked by the caller to allow for future
* expansion if required.
*/
if (msr & ~NPU3_VALID_ATS_MSR_BITS)
return OPAL_UNSUPPORTED;
lock(&npu->lock);
for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) {
map = npu3_read(npu, NPU3_XTS_BDF_MAP(i));
if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf)
break;
}
if (i == NPU3_XTS_BDF_MAP_MAX) {
NPU3ERR(npu, "LPARID not associated with any GPU\n");
rc = OPAL_PARAMETER;
goto out;
}
lparshort = GETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map);
NPU3DBG(npu, "Found LPARSHORT 0x%x for bdf %02llx:%02llx.%llx\n",
lparshort, PCI_BUS_NUM(bdf), PCI_DEV(bdf), PCI_FUNC(bdf));
rc = npu3_init_context_pid(npu, lparshort, msr);
if (rc)
goto out;
if (!(map & NPU3_XTS_BDF_MAP_VALID)) {
map |= NPU3_XTS_BDF_MAP_VALID;
npu3_write(npu, NPU3_XTS_BDF_MAP(i), map);
}
rc = lparshort;
out:
unlock(&npu->lock);
return rc;
}
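/*
 * Usage sketch: the OS issues OPAL_NPU_INIT_CONTEXT with the GPU's bdf
 * and MSR bits; we find the LPARSHORT that npu3_map_lpar() assigned to
 * that bdf, program the matching XTS_PID_MAP entry, mark the BDF map
 * entry valid, and return the LPARSHORT as the context ID.
 */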
static int64_t npu3_destroy_context_pid(struct npu3 *npu, uint32_t index)
{
if (!npu->nvlink.ctx_ref[index])
return OPAL_PARAMETER;
/* Only destroy when refcount hits 0 */
if (--npu->nvlink.ctx_ref[index])
return OPAL_PARTIAL;
NPU3DBG(npu, "XTS_PID_MAP[%03d] = 0 (destroy)\n", index);
npu3_write(npu, NPU3_XTS_PID_MAP(index), 0ull);
return OPAL_SUCCESS;
}
int64_t npu3_destroy_context(struct phb *phb, uint64_t bdf)
{
struct npu3 *npu = npu3_phb_to_npu(phb);
uint32_t lparshort, i;
	uint64_t map;
	int64_t rc;
lock(&npu->lock);
for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) {
map = npu3_read(npu, NPU3_XTS_BDF_MAP(i));
if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf)
break;
}
if (i == NPU3_XTS_BDF_MAP_MAX) {
NPU3ERR(npu, "LPARID not associated with any GPU\n");
rc = OPAL_PARAMETER;
goto out;
}
lparshort = GETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map);
rc = npu3_destroy_context_pid(npu, lparshort);
out:
unlock(&npu->lock);
return rc;
}
/* Map the given virtual bdf to lparid with given lpcr */
int64_t npu3_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid,
uint64_t lpcr)
{
struct npu3 *npu = npu3_phb_to_npu(phb);
struct npu3_dev *dev;
int64_t rc = OPAL_SUCCESS;
uint64_t map, val;
uint32_t i;
/*
* The LPCR bits are only required for hash based ATS, which we don't
* currently support, but may need to in the future.
*/
if (lpcr)
return OPAL_UNSUPPORTED;
lock(&npu->lock);
/* Update the entry if it already exists */
for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) {
map = npu3_read(npu, NPU3_XTS_BDF_MAP(i));
if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf)
break;
}
if (i == NPU3_XTS_BDF_MAP_MAX) {
/* No existing mapping found, find space for a new one */
for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++)
if (!npu3_read(npu, NPU3_XTS_BDF_MAP(i)))
break;
}
if (i == NPU3_XTS_BDF_MAP_MAX) {
NPU3ERR(npu, "No free XTS_BDF[] entry\n");
rc = OPAL_RESOURCE;
goto out;
}
map = NPU3_XTS_BDF_MAP_UNFILT;
map = SETFIELD(NPU3_XTS_BDF_MAP_BDF, map, bdf);
map = SETFIELD(NPU3_XTS_BDF_MAP_LPARID, map, lparid);
map = SETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map, i);
/* We only support radix at the moment */
map = SETFIELD(NPU3_XTS_BDF_MAP_XLAT, map, 0x3);
/* Find a link on which to send ATSDs for this device */
	npu3_for_each_nvlink_dev(dev, npu)
		if (dev->nvlink.gpu && dev->nvlink.gpu->bdfn == bdf)
			break;
	if (!dev || !dev->nvlink.gpu || dev->nvlink.gpu->bdfn != bdf) {
NPU3ERR(npu, "Can't find a link for bdf %02llx:%02llx.%llx\n",
PCI_BUS_NUM(bdf), PCI_DEV(bdf), PCI_FUNC(bdf));
rc = OPAL_PARAMETER;
goto out;
}
map = SETFIELD(NPU3_XTS_BDF_MAP_BRICK, map, dev->index);
NPU3DBG(npu, "XTS_BDF_MAP[%03d] = 0x%08llx\n", i, map);
npu3_write(npu, NPU3_XTS_BDF_MAP(i), map);
/* We need to allocate an ATSD per link */
val = SETFIELD(NPU3_XTS_ATSD_HYP_LPARID, 0ull, lparid);
if (!lparid)
val |= NPU3_XTS_ATSD_HYP_MSR_HV;
npu3_write(npu, NPU3_XTS_ATSD_HYP(dev->index), val);
out:
unlock(&npu->lock);
return rc;
}
static int64_t npu3_relaxed_order_enable(struct npu3 *npu, uint64_t src)
{
struct npu3_dev *dev;
uint32_t i;
for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++)
if (npu3_read(npu, NPU3_RELAXED_SRC(i)) == src)
return OPAL_SUCCESS; /* Already enabled */
/* Find somewhere to write this source */
for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++)
if (!npu3_read(npu, NPU3_RELAXED_SRC(i)))
break;
if (i == NPU3_RELAXED_SRC_MAX) {
NPU3ERR(npu, "Insufficient resources to activate relaxed ordering mode\n");
return OPAL_RESOURCE;
}
npu3_write(npu, NPU3_RELAXED_SRC(i), src);
npu3_for_each_nvlink_dev(dev, npu) {
uint64_t val = npu3_read(npu, NPU3_RELAXED_CFG2(dev->index));
val |= NPU3_RELAXED_CFG2_SRC_WRENA(i) |
NPU3_RELAXED_CFG2_SRC_RDENA(i);
npu3_write(npu, NPU3_RELAXED_CFG2(dev->index), val);
}
return OPAL_SUCCESS;
}
static void npu3_relaxed_order_disable(struct npu3 *npu, uint64_t src)
{
struct npu3_dev *dev;
uint32_t i;
for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++)
if (npu3_read(npu, NPU3_RELAXED_SRC(i)) == src)
break;
if (i == NPU3_RELAXED_SRC_MAX)
return; /* Already disabled */
npu3_for_each_nvlink_dev(dev, npu) {
uint64_t val = npu3_read(npu, NPU3_RELAXED_CFG2(dev->index));
val &= ~NPU3_RELAXED_CFG2_SRC_WRENA(i);
val &= ~NPU3_RELAXED_CFG2_SRC_RDENA(i);
npu3_write(npu, NPU3_RELAXED_CFG2(dev->index), val);
}
npu3_write(npu, NPU3_RELAXED_SRC(i), 0ull);
}
/* Enable or disable relaxed ordering on all nvlinks for a given PEC. */
int64_t npu3_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec,
bool enable)
{
struct npu3 *npu = npu3_phb_to_npu(phb);
int64_t rc = OPAL_SUCCESS;
uint64_t src;
NPU3INF(npu, "%s relaxed ordering for PEC %d on chip %d\n",
enable ? "Enabling" : "Disabling",
pec, gcid);
lock(&npu->lock);
src = SETFIELD(NPU3_RELAXED_SRC_GRPCHP, 0ull, gcid);
src = SETFIELD(NPU3_RELAXED_SRC_PEC, src, pec);
src = SETFIELD(NPU3_RELAXED_SRC_RDSTART, src, 0);
src = SETFIELD(NPU3_RELAXED_SRC_RDEND, src, 47);
src = SETFIELD(NPU3_RELAXED_SRC_WRSTART, src, 0);
src = SETFIELD(NPU3_RELAXED_SRC_WREND, src, 23);
if (enable)
rc = npu3_relaxed_order_enable(npu, src);
else
npu3_relaxed_order_disable(npu, src);
unlock(&npu->lock);
return rc;
}