Merge remote-tracking branch 'remotes/dgibson/tags/ppc-for-5.0-20200203' into staging

ppc patch queue 2020-02-03

This pull request supersedes ppc-for-5.0-20200131.  The only changes
are one extra patch to suppress some irritating warnings during tests
under TCG, and an extra Tested-by in one of the other patches.

Here's the next batch of patches for ppc and associated machine types.
Highlights include:
 * Remove the deprecated "prep" machine type and its OpenHackware
   firmware
 * Add TCG emulation of the msgsndp and related supervisor-privileged
   doorbell instructions
 * Allow the "pnv" machine type to run Hostboot-style firmwares
 * Add a virtual TPM device for spapr machines
 * Implement devices for POWER8 PHB3 and POWER9 PHB4 host bridges for
   the pnv machine type
 * Use faster Spectre mitigation by default for POWER9 DD2.3 machines
 * Introduce Firmware Assisted NMI dump facility for spapr machines
 * Fix a performance regression with load/store multiple instructions
   in TCG

as well as some other assorted cleanups and fixes.

# gpg: Signature made Mon 03 Feb 2020 03:30:24 GMT
# gpg:                using RSA key 75F46586AE61A66CC44E87DC6C38CACA20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>" [full]
# gpg:                 aka "David Gibson (Red Hat) <dgibson@redhat.com>" [full]
# gpg:                 aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>" [full]
# gpg:                 aka "David Gibson (kernel.org) <dwg@kernel.org>" [unknown]
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E  87DC 6C38 CACA 20D9 B392

* remotes/dgibson/tags/ppc-for-5.0-20200203: (35 commits)
  tests: Silence various warnings with pseries
  target/ppc: Use probe_write for DCBZ
  target/ppc: Remove redundant mask in DCBZ
  target/ppc: Use probe_access for LMW, STMW
  target/ppc: Use probe_access for LSW, STSW
  ppc: spapr: Activate the FWNMI functionality
  migration: Include migration support for machine check handling
  ppc: spapr: Handle "ibm,nmi-register" and "ibm,nmi-interlock" RTAS calls
  target/ppc: Build rtas error log upon an MCE
  target/ppc: Handle NMI guest exit
  ppc: spapr: Introduce FWNMI capability
  Wrapper function to wait on condition for the main loop mutex
  target/ppc/cpu.h: Put macro parameter in parentheses
  spapr: Enable DD2.3 accelerated count cache flush in pseries-5.0 machine
  ppc/pnv: change the PowerNV machine devices to be non user creatable
  ppc/pnv: Add models for POWER8 PHB3 PCIe Host bridge
  ppc/pnv: Add models for POWER9 PHB4 PCIe Host bridge
  docs/specs/tpm: reST-ify TPM documentation
  hw/ppc/Kconfig: Enable TPM_SPAPR as part of PSERIES config
  tpm_spapr: Support suspend and resume
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
diff --git a/.gitmodules b/.gitmodules
index 19792c9..9c0501a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,9 +10,6 @@
 [submodule "roms/openbios"]
 	path = roms/openbios
 	url = https://git.qemu.org/git/openbios.git
-[submodule "roms/openhackware"]
-	path = roms/openhackware
-	url = https://git.qemu.org/git/openhackware.git
 [submodule "roms/qemu-palcode"]
 	path = roms/qemu-palcode
 	url = https://git.qemu.org/git/qemu-palcode.git
diff --git a/MAINTAINERS b/MAINTAINERS
index 4ceb1ad..faffd44 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1103,7 +1103,6 @@
 F: hw/rtc/m48t59-isa.c
 F: include/hw/isa/pc87312.h
 F: include/hw/rtc/m48t59.h
-F: pc-bios/ppc_rom.bin
 F: tests/acceptance/ppc_prep_40p.py
 
 sPAPR
diff --git a/Makefile b/Makefile
index 9a5a1e6..3b21c0e 100644
--- a/Makefile
+++ b/Makefile
@@ -784,7 +784,7 @@
 BLOBS=bios.bin bios-256k.bin bios-microvm.bin sgabios.bin vgabios.bin vgabios-cirrus.bin \
 vgabios-stdvga.bin vgabios-vmware.bin vgabios-qxl.bin vgabios-virtio.bin \
 vgabios-ramfb.bin vgabios-bochs-display.bin vgabios-ati.bin \
-ppc_rom.bin openbios-sparc32 openbios-sparc64 openbios-ppc QEMU,tcx.bin QEMU,cgthree.bin \
+openbios-sparc32 openbios-sparc64 openbios-ppc QEMU,tcx.bin QEMU,cgthree.bin \
 pxe-e1000.rom pxe-eepro100.rom pxe-ne2k_pci.rom \
 pxe-pcnet.rom pxe-rtl8139.rom pxe-virtio.rom \
 efi-e1000.rom efi-eepro100.rom efi-ne2k_pci.rom \
diff --git a/cpus.c b/cpus.c
index b612116..b4f8b84 100644
--- a/cpus.c
+++ b/cpus.c
@@ -1839,6 +1839,11 @@
     qemu_mutex_unlock(&qemu_global_mutex);
 }
 
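+/*
+ * Wait on @cond, temporarily dropping the main loop mutex
+ * (qemu_global_mutex), which the caller must hold.
+ */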
+void qemu_cond_wait_iothread(QemuCond *cond)
+{
+    qemu_cond_wait(cond, &qemu_global_mutex);
+}
+
 static bool all_vcpus_paused(void)
 {
     CPUState *cpu;
diff --git a/docs/interop/firmware.json b/docs/interop/firmware.json
index 8ffb785..240f565 100644
--- a/docs/interop/firmware.json
+++ b/docs/interop/firmware.json
@@ -27,8 +27,7 @@
 #
 # @openfirmware: The interface is defined by the (historical) IEEE
 #                1275-1994 standard. Examples for firmware projects that
-#                provide this interface are: OpenBIOS, OpenHackWare,
-#                SLOF.
+#                provide this interface are: OpenBIOS and SLOF.
 #
 # @uboot: Firmware interface defined by the U-Boot project.
 #
diff --git a/docs/specs/index.rst b/docs/specs/index.rst
index 984ba44..de46a8b 100644
--- a/docs/specs/index.rst
+++ b/docs/specs/index.rst
@@ -13,3 +13,4 @@
    ppc-xive
    ppc-spapr-xive
    acpi_hw_reduced_hotplug
+   tpm
diff --git a/docs/specs/tpm.rst b/docs/specs/tpm.rst
new file mode 100644
index 0000000..2bdf637
--- /dev/null
+++ b/docs/specs/tpm.rst
@@ -0,0 +1,503 @@
+===============
+QEMU TPM Device
+===============
+
+Guest-side hardware interface
+=============================
+
+TIS interface
+-------------
+
+The QEMU TPM emulation implements a TPM TIS hardware interface
+following the Trusted Computing Group's specification "TCG PC Client
+Specific TPM Interface Specification (TIS)", Specification Version
+1.3, 21 March 2013. (see the `TIS specification`_, or a later version
+of it).
+
+The TIS interface makes a memory mapped IO region in the area
+0xfed40000-0xfed44fff available to the guest operating system.
+
+QEMU files related to TPM TIS interface:
+ - ``hw/tpm/tpm_tis.c``
+ - ``hw/tpm/tpm_tis.h``
+
+CRB interface
+-------------
+
+QEMU also implements a TPM CRB interface following the Trusted
+Computing Group's specification "TCG PC Client Platform TPM Profile
+(PTP) Specification", Family "2.0", Level 00 Revision 01.03 v22, May
+22, 2017. (see the `CRB specification`_, or a later version of it)
+
+The CRB interface makes a memory mapped IO region in the area
+0xfed40000-0xfed40fff (1 locality) available to the guest
+operating system.
+
+QEMU files related to TPM CRB interface:
+ - ``hw/tpm/tpm_crb.c``
+
+SPAPR interface
+---------------
+
+pSeries (ppc64) machines offer a tpm-spapr device model.
+
+QEMU files related to the SPAPR interface:
+ - ``hw/tpm/tpm_spapr.c``
+
+fw_cfg interface
+================
+
+The bios/firmware may read the ``"etc/tpm/config"`` fw_cfg entry for
+configuring the guest appropriately.
+
+The 6-byte entry has the following content, in little-endian:
+
+.. code-block:: c
+
+    #define TPM_VERSION_UNSPEC          0
+    #define TPM_VERSION_1_2             1
+    #define TPM_VERSION_2_0             2
+
+    #define TPM_PPI_VERSION_NONE        0
+    #define TPM_PPI_VERSION_1_30        1
+
+    struct FwCfgTPMConfig {
+        uint32_t tpmppi_address;         /* PPI memory location */
+        uint8_t tpm_version;             /* TPM version */
+        uint8_t tpmppi_version;          /* PPI version */
+    };
+
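+As an illustration only (this is not code taken from QEMU or from any
+particular firmware), a guest firmware could interpret this entry
+roughly as follows; the ``fw_cfg_read_file()`` helper is hypothetical
+and stands in for whatever fw_cfg access routine the firmware
+provides:
+
+.. code-block:: c
+
+    /* Sketch only: read and interpret "etc/tpm/config". */
+    struct FwCfgTPMConfig cfg;
+
+    if (fw_cfg_read_file("etc/tpm/config", &cfg, sizeof(cfg)) == sizeof(cfg)) {
+        if (cfg.tpm_version == TPM_VERSION_2_0) {
+            /* Expose TPM 2 support to the OS. */
+        }
+        if (cfg.tpmppi_version == TPM_PPI_VERSION_1_30) {
+            /* The PPI region lives at guest address cfg.tpmppi_address. */
+        }
+    }
+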
+ACPI interface
+==============
+
+The TPM device is defined with ACPI ID "PNP0C31". QEMU builds an SSDT
+and passes it into the guest through the fw_cfg device. The device
+description contains the base address of the TIS interface, 0xfed40000,
+and the size of the MMIO area (0x5000). In case a TPM 2 is used by
+QEMU, a TPM2 ACPI table is also provided. The device is described as
+operating in polling mode rather than interrupt mode, primarily
+because no unused IRQ could be found.
+
+To allow measurement logs to be written by the firmware,
+e.g. SeaBIOS, a TCPA table is implemented. This table provides a
+64 KiB buffer into which the firmware can write its log. For TPM 2,
+only a more recent version of the TPM2 table provides support for
+measurement logs, and a TCPA table does not need to be created.
+
+The TCPA and TPM2 ACPI tables follow the Trusted Computing Group
+specification "TCG ACPI Specification" Family "1.2" and "2.0", Level
+00 Revision 00.37. (see the `ACPI specification`_, or a later version
+of it)
+
+ACPI PPI Interface
+------------------
+
+QEMU supports the Physical Presence Interface (PPI) for TPM 1.2 and
+TPM 2. This interface requires ACPI and firmware support. (see the
+`PPI specification`_)
+
+PPI enables a system administrator (root) to request a modification to
+the TPM upon reboot. The PPI specification defines the operation
+requests and the actions the firmware has to take. The system
+administrator passes the operation request number to the firmware
+through an ACPI interface which writes this number to a memory
+location that the firmware knows. Upon reboot, the firmware finds the
+number and sends commands to the TPM. The firmware writes the TPM
+result code and the operation request number to a memory location that
+ACPI can read from and pass the result on to the administrator.
+
+The PPI specification defines a set of mandatory and optional
+operations for the firmware to implement. The ACPI interface also
+allows an administrator to list the supported operations. The ACPI
+code is generated by QEMU, yet the firmware needs to implement
+support on a per-operation basis, and different firmwares may support
+a different subset. Therefore, QEMU introduces a virtual memory
+device for PPI through which the firmware can indicate which
+operations it supports, so that ACPI can enable the supported
+operations and disable all others. This interface lies in main memory
+and has the following layout:
+
+ +-------------+--------+--------+-------------------------------------------+
+ |  Field      | Length | Offset | Description                               |
+ +=============+========+========+===========================================+
+ | ``func``    |  0x100 |  0x000 | Firmware sets values for each supported   |
+ |             |        |        | operation. See defined values below.      |
+ +-------------+--------+--------+-------------------------------------------+
+ | ``ppin``    |   0x1  |  0x100 | SMI interrupt to use. Set by firmware.    |
+ |             |        |        | Not supported.                            |
+ +-------------+--------+--------+-------------------------------------------+
+ | ``ppip``    |   0x4  |  0x101 | ACPI function index to pass to SMM code.  |
+ |             |        |        | Set by ACPI. Not supported.               |
+ +-------------+--------+--------+-------------------------------------------+
+ | ``pprp``    |   0x4  |  0x105 | Result of last executed operation. Set by |
+ |             |        |        | firmware. See function index 5 for values.|
+ +-------------+--------+--------+-------------------------------------------+
+ | ``pprq``    |   0x4  |  0x109 | Operation request number to execute. See  |
+ |             |        |        | 'Physical Presence Interface Operation    |
+ |             |        |        | Summary' tables in specs. Set by ACPI.    |
+ +-------------+--------+--------+-------------------------------------------+
+ | ``pprm``    |   0x4  |  0x10d | Operation request optional parameter.     |
+ |             |        |        | Values depend on operation. Set by ACPI.  |
+ +-------------+--------+--------+-------------------------------------------+
+ | ``lppr``    |   0x4  |  0x111 | Last executed operation request number.   |
+ |             |        |        | Copied from pprq field by firmware.       |
+ +-------------+--------+--------+-------------------------------------------+
+ | ``fret``    |   0x4  |  0x115 | Result code from SMM function.            |
+ |             |        |        | Not supported.                            |
+ +-------------+--------+--------+-------------------------------------------+
+ | ``res1``    |  0x40  |  0x119 | Reserved for future use                   |
+ +-------------+--------+--------+-------------------------------------------+
+ |``next_step``|   0x1  |  0x159 | Operation to execute after reboot by      |
+ |             |        |        | firmware. Used by firmware.               |
+ +-------------+--------+--------+-------------------------------------------+
+ | ``movv``    |   0x1  |  0x15a | Memory overwrite variable                 |
+ +-------------+--------+--------+-------------------------------------------+
+
+The following values are supported for the ``func`` field. They
+correspond to the values used by ACPI function index 8.
+
+ +----------+-------------------------------------------------------------+
+ | Value    | Description                                                 |
+ +==========+=============================================================+
+ | 0        | Operation is not implemented.                               |
+ +----------+-------------------------------------------------------------+
+ | 1        | Operation is only accessible through firmware.              |
+ +----------+-------------------------------------------------------------+
+ | 2        | Operation is blocked for OS by firmware configuration.      |
+ +----------+-------------------------------------------------------------+
+ | 3        | Operation is allowed and physically present user required.  |
+ +----------+-------------------------------------------------------------+
+ | 4        | Operation is allowed and physically present user is not     |
+ |          | required.                                                   |
+ +----------+-------------------------------------------------------------+
+
+The guest-physical location of this memory region is given by the
+fw_cfg ``tpmppi_address`` field. The PPI memory region size is 0x400
+(``TPM_PPI_ADDR_SIZE``) to leave enough room for future updates.
+
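+For reference, the layout above can be mirrored by a packed C
+structure. The sketch below is derived purely from the offsets in the
+table and is not a definition taken from the QEMU sources:
+
+.. code-block:: c
+
+    /* Illustrative mirror of the PPI region (offsets per the table above). */
+    struct tpm_ppi {
+        uint8_t  func[0x100];  /* 0x000: per-operation support values */
+        uint8_t  ppin;         /* 0x100: SMI interrupt to use (not supported) */
+        uint32_t ppip;         /* 0x101: ACPI function index for SMM (not supported) */
+        uint32_t pprp;         /* 0x105: result of last executed operation */
+        uint32_t pprq;         /* 0x109: operation request number to execute */
+        uint32_t pprm;         /* 0x10d: optional parameter for the request */
+        uint32_t lppr;         /* 0x111: last executed operation request number */
+        uint32_t fret;         /* 0x115: result code from SMM function (not supported) */
+        uint8_t  res1[0x40];   /* 0x119: reserved for future use */
+        uint8_t  next_step;    /* 0x159: operation to execute after reboot */
+        uint8_t  movv;         /* 0x15a: memory overwrite variable */
+    } __attribute__((packed));
+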
+QEMU files related to TPM ACPI tables:
+ - ``hw/i386/acpi-build.c``
+ - ``include/hw/acpi/tpm.h``
+
+TPM backend devices
+===================
+
+The TPM implementation is split into two parts: frontend and
+backend. The frontend is the hardware interface, such as the TPM TIS
+interface described earlier; the backend implements the interaction
+with a TPM device, which may be a physical or an emulated device. The
+split between frontend and backend allows a frontend to be connected
+with any available backend. This enables, for example, the TIS
+interface to be used with the passthrough backend or the swtpm
+backend.
+
+QEMU files related to TPM backends:
+ - ``backends/tpm.c``
+ - ``include/sysemu/tpm_backend.h``
+ - ``include/sysemu/tpm_backend_int.h``
+
+The QEMU TPM passthrough device
+-------------------------------
+
+If QEMU is run on Linux as the host operating system, it is possible
+to make the host's hardware TPM device available to a single QEMU
+guest. In this case the user must make sure that no other program is
+using the device, e.g. /dev/tpm0, before trying to start QEMU with
+it.
+
+The passthrough driver sends TPM commands to, and receives responses
+from, the host's TPM device. Besides that, it accesses the TPM
+device's sysfs entry to support command cancellation. Since none of
+the state of a hardware TPM can be migrated between hosts, virtual
+machine migration is disabled when the TPM passthrough driver is
+used.
+
+Since the host's TPM device will already be initialized by the host's
+firmware, certain commands, e.g. ``TPM_Startup()``, sent by the
+virtual firmware for device initialization, will fail. In this case
+the firmware should not use the TPM.
+
+Sharing the device with the host is generally not a recommended usage
+scenario for a TPM device. The primary reason for this is that two
+operating systems can then access the device's single set of
+resources, such as platform configuration registers
+(PCRs). Applications or kernel security subsystems, such as the Linux
+Integrity Measurement Architecture (IMA), are not expecting to share
+PCRs.
+
+QEMU files related to the TPM passthrough device:
+ - ``hw/tpm/tpm_passthrough.c``
+ - ``hw/tpm/tpm_util.c``
+ - ``hw/tpm/tpm_util.h``
+
+
+Command line to start QEMU with the TPM passthrough device using the host's
+hardware TPM ``/dev/tpm0``:
+
+.. code-block:: console
+
+  qemu-system-x86_64 -display sdl -accel kvm \
+  -m 1024 -boot d -bios bios-256k.bin -boot menu=on \
+  -tpmdev passthrough,id=tpm0,path=/dev/tpm0 \
+  -device tpm-tis,tpmdev=tpm0 test.img
+
+
+The following commands should result in similar output inside the VM
+with a Linux kernel that either has the TPM TIS driver built-in or
+available as a module:
+
+.. code-block:: console
+
+  # dmesg | grep -i tpm
+  [    0.711310] tpm_tis 00:06: 1.2 TPM (device=id 0x1, rev-id 1)
+
+  # dmesg | grep TCPA
+  [    0.000000] ACPI: TCPA 0x0000000003FFD191C 000032 (v02 BOCHS  \
+      BXPCTCPA 0000001 BXPC 00000001)
+
+  # ls -l /dev/tpm*
+  crw-------. 1 root root 10, 224 Jul 11 10:11 /dev/tpm0
+
+  # find /sys/devices/ | grep pcrs$ | xargs cat
+  PCR-00: 35 4E 3B CE 23 9F 38 59 ...
+  ...
+  PCR-23: 00 00 00 00 00 00 00 00 ...
+
+The QEMU TPM emulator device
+----------------------------
+
+The TPM emulator device uses an external TPM emulator called 'swtpm'
+to which it sends TPM commands and from which it receives the
+responses. The swtpm program must have been started before QEMU tries
+to access it through the TPM emulator device.
+
+The TPM emulator implements a command channel for transferring TPM
+commands and responses as well as a control channel over which control
+commands can be sent. (see the `SWTPM protocol`_ specification)
+
+The control channel serves the purpose of resetting, initializing, and
+migrating the TPM state, among other things.
+
+The swtpm program behaves like a hardware TPM and therefore needs to
+be initialized by the firmware running inside the QEMU virtual
+machine.  One necessary step for initializing the device is to send
+the TPM_Startup command to it. SeaBIOS, for example, has been
+instrumented to initialize a TPM 1.2 or TPM 2 device using this
+command.
+
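+For illustration, the TPM 1.2 ``TPM_Startup(ST_CLEAR)`` request the
+firmware issues is a fixed 12-byte, big-endian sequence. The encoding
+below follows the TPM 1.2 command format and is shown only as a
+sketch, not as code taken from SeaBIOS or QEMU:
+
+.. code-block:: c
+
+    /* TPM 1.2 TPM_Startup(ST_CLEAR), big-endian on the wire. */
+    static const uint8_t tpm12_startup_clear[] = {
+        0x00, 0xC1,             /* tag: TPM_TAG_RQU_COMMAND */
+        0x00, 0x00, 0x00, 0x0C, /* paramSize: 12 bytes */
+        0x00, 0x00, 0x00, 0x99, /* ordinal: TPM_ORD_Startup */
+        0x00, 0x01              /* startupType: TPM_ST_CLEAR */
+    };
+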
+QEMU files related to the TPM emulator device:
+ - ``hw/tpm/tpm_emulator.c``
+ - ``hw/tpm/tpm_util.c``
+ - ``hw/tpm/tpm_util.h``
+
+The following commands start the swtpm with a UnixIO control channel over
+a socket interface. They do not need to be run as root.
+
+.. code-block:: console
+
+  mkdir /tmp/mytpm1
+  swtpm socket --tpmstate dir=/tmp/mytpm1 \
+    --ctrl type=unixio,path=/tmp/mytpm1/swtpm-sock \
+    --log level=20
+
+Command line to start QEMU with the TPM emulator device communicating
+with the swtpm (x86):
+
+.. code-block:: console
+
+  qemu-system-x86_64 -display sdl -accel kvm \
+    -m 1024 -boot d -bios bios-256k.bin -boot menu=on \
+    -chardev socket,id=chrtpm,path=/tmp/mytpm1/swtpm-sock \
+    -tpmdev emulator,id=tpm0,chardev=chrtpm \
+    -device tpm-tis,tpmdev=tpm0 test.img
+
+In case a pSeries machine is emulated, use the following command line:
+
+.. code-block:: console
+
+  qemu-system-ppc64 -display sdl -machine pseries,accel=kvm \
+    -m 1024 -bios slof.bin -boot menu=on \
+    -nodefaults -device VGA -device pci-ohci -device usb-kbd \
+    -chardev socket,id=chrtpm,path=/tmp/mytpm1/swtpm-sock \
+    -tpmdev emulator,id=tpm0,chardev=chrtpm \
+    -device tpm-spapr,tpmdev=tpm0 \
+    -device spapr-vscsi,id=scsi0,reg=0x00002000 \
+    -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x3,drive=drive-virtio-disk0,id=virtio-disk0 \
+    -drive file=test.img,format=raw,if=none,id=drive-virtio-disk0
+
+In case SeaBIOS is used as firmware, it should show the TPM menu item
+after entering the menu with 'ESC'.
+
+.. code-block:: console
+
+  Select boot device:
+  1. DVD/CD [ata1-0: QEMU DVD-ROM ATAPI-4 DVD/CD]
+  [...]
+  5. Legacy option rom
+
+  t. TPM Configuration
+
+The following commands should result in similar output inside the VM
+with a Linux kernel that either has the TPM TIS driver built-in or
+available as a module:
+
+.. code-block:: console
+
+  # dmesg | grep -i tpm
+  [    0.711310] tpm_tis 00:06: 1.2 TPM (device=id 0x1, rev-id 1)
+
+  # dmesg | grep TCPA
+  [    0.000000] ACPI: TCPA 0x0000000003FFD191C 000032 (v02 BOCHS  \
+      BXPCTCPA 0000001 BXPC 00000001)
+
+  # ls -l /dev/tpm*
+  crw-------. 1 root root 10, 224 Jul 11 10:11 /dev/tpm0
+
+  # find /sys/devices/ | grep pcrs$ | xargs cat
+  PCR-00: 35 4E 3B CE 23 9F 38 59 ...
+  ...
+  PCR-23: 00 00 00 00 00 00 00 00 ...
+
+Migration with the TPM emulator
+===============================
+
+The TPM emulator supports the following types of virtual machine
+migration:
+
+- VM save / restore (migration into a file)
+- Network migration
+- Snapshotting (migration into storage like qcow2 or QED)
+
+The following command sequences can be used to test VM save / restore.
+
+In a 1st terminal start an instance of a swtpm using the following command:
+
+.. code-block:: console
+
+  mkdir /tmp/mytpm1
+  swtpm socket --tpmstate dir=/tmp/mytpm1 \
+    --ctrl type=unixio,path=/tmp/mytpm1/swtpm-sock \
+    --log level=20 --tpm2
+
+In a 2nd terminal start the VM:
+
+.. code-block:: console
+
+  qemu-system-x86_64 -display sdl -accel kvm \
+    -m 1024 -boot d -bios bios-256k.bin -boot menu=on \
+    -chardev socket,id=chrtpm,path=/tmp/mytpm1/swtpm-sock \
+    -tpmdev emulator,id=tpm0,chardev=chrtpm \
+    -device tpm-tis,tpmdev=tpm0 \
+    -monitor stdio \
+    test.img
+
+Verify that the attached TPM is working as expected using applications
+inside the VM.
+
+To store the state of the VM use the following command in the QEMU
+monitor in the 2nd terminal:
+
+.. code-block:: console
+
+  (qemu) migrate "exec:cat > testvm.bin"
+  (qemu) quit
+
+At this point a file called ``testvm.bin`` should exist and the swtpm
+and QEMU processes should have ended.
+
+To test 'VM restore' you have to start the swtpm with the same
+parameters as before. If a TPM 2 (``--tpm2``) was previously saved,
+``--tpm2`` must now be passed again on the command line.
+
+In the 1st terminal restart the swtpm with the same command line as
+before:
+
+.. code-block:: console
+
+  swtpm socket --tpmstate dir=/tmp/mytpm1 \
+    --ctrl type=unixio,path=/tmp/mytpm1/swtpm-sock \
+    --log level=20 --tpm2
+
+In the 2nd terminal restore the state of the VM using the additional
+'-incoming' option.
+
+.. code-block:: console
+
+  qemu-system-x86_64 -display sdl -accel kvm \
+    -m 1024 -boot d -bios bios-256k.bin -boot menu=on \
+    -chardev socket,id=chrtpm,path=/tmp/mytpm1/swtpm-sock \
+    -tpmdev emulator,id=tpm0,chardev=chrtpm \
+    -device tpm-tis,tpmdev=tpm0 \
+    -incoming "exec:cat < testvm.bin" \
+    test.img
+
+Troubleshooting migration
+-------------------------
+
+There are several reasons why migration may fail. In case of problems,
+please ensure that the command lines adhere to the following rules
+and, if possible, that identical versions of QEMU and swtpm are used
+at all times.
+
+VM save and restore:
+
+ - QEMU command line parameters should be identical apart from the
+   '-incoming' option on VM restore
+
+ - swtpm command line parameters should be identical
+
+VM migration to 'localhost':
+
+ - QEMU command line parameters should be identical apart from the
+   '-incoming' option on the destination side
+
+ - swtpm command line parameters should point to two different state
+   directories for the source and destination swtpm (``--tpmstate dir=...``),
+   especially if different versions of libtpms are used on the same
+   machine.
+
+VM migration across the network:
+
+ - QEMU command line parameters should be identical apart from the
+   '-incoming' option on the destination side
+
+ - swtpm command line parameters should be identical
+
+VM Snapshotting:
+ - QEMU command line parameters should be identical
+
+ - swtpm command line parameters should be identical
+
+
+Besides that, migration failure reasons on the swtpm level may include
+the following:
+
+ - the versions of the swtpm on the source and destination sides are
+   incompatible
+
+   - downgrading of TPM state may not be supported
+
+   - the source and destination libtpms were compiled with different
+     compile-time options and the destination side refuses to accept the
+     state
+
+ - different migration keys are used on the source and destination side
+   and the destination side cannot decrypt the migrated state
+   (swtpm ... --migration-key ... )
+
+
+.. _TIS specification:
+   https://trustedcomputinggroup.org/pc-client-work-group-pc-client-specific-tpm-interface-specification-tis/
+
+.. _CRB specification:
+   https://trustedcomputinggroup.org/resource/pc-client-platform-tpm-profile-ptp-specification/
+
+
+.. _ACPI specification:
+   https://trustedcomputinggroup.org/tcg-acpi-specification/
+
+.. _PPI specification:
+   https://trustedcomputinggroup.org/resource/tcg-physical-presence-interface-specification/
+
+.. _SWTPM protocol:
+   https://github.com/stefanberger/swtpm/blob/master/man/man3/swtpm_ioctls.pod
diff --git a/docs/specs/tpm.txt b/docs/specs/tpm.txt
deleted file mode 100644
index 9c8cca0..0000000
--- a/docs/specs/tpm.txt
+++ /dev/null
@@ -1,427 +0,0 @@
-QEMU TPM Device
-===============
-
-= Guest-side Hardware Interface =
-
-The QEMU TPM emulation implements a TPM TIS hardware interface following the
-Trusted Computing Group's specification "TCG PC Client Specific TPM Interface
-Specification (TIS)", Specification Version 1.3, 21 March 2013. This
-specification, or a later version of it, can be accessed from the following
-URL:
-
-https://trustedcomputinggroup.org/pc-client-work-group-pc-client-specific-tpm-interface-specification-tis/
-
-The TIS interface makes a memory mapped IO region in the area 0xfed40000 -
-0xfed44fff available to the guest operating system.
-
-
-QEMU files related to TPM TIS interface:
- - hw/tpm/tpm_tis.c
- - hw/tpm/tpm_tis.h
-
-
-QEMU also implements a TPM CRB interface following the Trusted Computing
-Group's specification "TCG PC Client Platform TPM Profile (PTP)
-Specification", Family "2.0", Level 00 Revision 01.03 v22, May 22, 2017.
-This specification, or a later version of it, can be accessed from the
-following URL:
-
-https://trustedcomputinggroup.org/resource/pc-client-platform-tpm-profile-ptp-specification/
-
-The CRB interface makes a memory mapped IO region in the area 0xfed40000 -
-0xfed40fff (1 locality) available to the guest operating system.
-
-QEMU files related to TPM CRB interface:
- - hw/tpm/tpm_crb.c
-
-= fw_cfg interface =
-
-The bios/firmware may read the "etc/tpm/config" fw_cfg entry for
-configuring the guest appropriately.
-
-The entry of 6 bytes has the following content, in little-endian:
-
-    #define TPM_VERSION_UNSPEC          0
-    #define TPM_VERSION_1_2             1
-    #define TPM_VERSION_2_0             2
-
-    #define TPM_PPI_VERSION_NONE        0
-    #define TPM_PPI_VERSION_1_30        1
-
-    struct FwCfgTPMConfig {
-        uint32_t tpmppi_address;         /* PPI memory location */
-        uint8_t tpm_version;             /* TPM version */
-        uint8_t tpmppi_version;          /* PPI version */
-    };
-
-= ACPI Interface =
-
-The TPM device is defined with ACPI ID "PNP0C31". QEMU builds a SSDT and passes
-it into the guest through the fw_cfg device. The device description contains
-the base address of the TIS interface 0xfed40000 and the size of the MMIO area
-(0x5000). In case a TPM2 is used by QEMU, a TPM2 ACPI table is also provided.
-The device is described to be used in polling mode rather than interrupt mode
-primarily because no unused IRQ could be found.
-
-To support measurement logs to be written by the firmware, e.g. SeaBIOS, a TCPA
-table is implemented. This table provides a 64kb buffer where the firmware can
-write its log into. For TPM 2 only a more recent version of the TPM2 table
-provides support for measurements logs and a TCPA table does not need to be
-created.
-
-The TCPA and TPM2 ACPI tables follow the Trusted Computing Group specification
-"TCG ACPI Specification" Family "1.2" and "2.0", Level 00 Revision 00.37. This
-specification, or a later version of it, can be accessed from the following
-URL:
-
-https://trustedcomputinggroup.org/tcg-acpi-specification/
-
-== ACPI PPI Interface ==
-
-QEMU supports the Physical Presence Interface (PPI) for TPM 1.2 and TPM 2. This
-interface requires ACPI and firmware support. The specification can be found at
-the following URL:
-
-https://trustedcomputinggroup.org/resource/tcg-physical-presence-interface-specification/
-
-PPI enables a system administrator (root) to request a modification to the
-TPM upon reboot. The PPI specification defines the operation requests and the
-actions the firmware has to take. The system administrator passes the operation
-request number to the firmware through an ACPI interface which writes this
-number to a memory location that the firmware knows. Upon reboot, the firmware
-finds the number and sends commands to the TPM. The firmware writes the TPM
-result code and the operation request number to a memory location that ACPI can
-read from and pass the result on to the administrator.
-
-The PPI specification defines a set of mandatory and optional operations for
-the firmware to implement. The ACPI interface also allows an administrator to
-list the supported operations. In QEMU the ACPI code is generated by QEMU, yet
-the firmware needs to implement support on a per-operations basis, and
-different firmwares may support a different subset. Therefore, QEMU introduces
-the virtual memory device for PPI where the firmware can indicate which
-operations it supports and ACPI can enable the ones that are supported and
-disable all others. This interface lies in main memory and has the following
-layout:
-
- +----------+--------+--------+-------------------------------------------+
- |  Field   | Length | Offset | Description                               |
- +----------+--------+--------+-------------------------------------------+
- | func     |  0x100 |  0x000 | Firmware sets values for each supported   |
- |          |        |        | operation. See defined values below.      |
- +----------+--------+--------+-------------------------------------------+
- | ppin     |   0x1  |  0x100 | SMI interrupt to use. Set by firmware.    |
- |          |        |        | Not supported.                            |
- +----------+--------+--------+-------------------------------------------+
- | ppip     |   0x4  |  0x101 | ACPI function index to pass to SMM code.  |
- |          |        |        | Set by ACPI. Not supported.               |
- +----------+--------+--------+-------------------------------------------+
- | pprp     |   0x4  |  0x105 | Result of last executed operation. Set by |
- |          |        |        | firmware. See function index 5 for values.|
- +----------+--------+--------+-------------------------------------------+
- | pprq     |   0x4  |  0x109 | Operation request number to execute. See  |
- |          |        |        | 'Physical Presence Interface Operation    |
- |          |        |        | Summary' tables in specs. Set by ACPI.    |
- +----------+--------+--------+-------------------------------------------+
- | pprm     |   0x4  |  0x10d | Operation request optional parameter.     |
- |          |        |        | Values depend on operation. Set by ACPI.  |
- +----------+--------+--------+-------------------------------------------+
- | lppr     |   0x4  |  0x111 | Last executed operation request number.   |
- |          |        |        | Copied from pprq field by firmware.       |
- +----------+--------+--------+-------------------------------------------+
- | fret     |   0x4  |  0x115 | Result code from SMM function.            |
- |          |        |        | Not supported.                            |
- +----------+--------+--------+-------------------------------------------+
- | res1     |  0x40  |  0x119 | Reserved for future use                   |
- +----------+--------+--------+-------------------------------------------+
- | next_step|   0x1  |  0x159 | Operation to execute after reboot by      |
- |          |        |        | firmware. Used by firmware.               |
- +----------+--------+--------+-------------------------------------------+
- | movv     |   0x1  |  0x15a | Memory overwrite variable                 |
- +----------+--------+--------+-------------------------------------------+
-
-   The following values are supported for the 'func' field. They correspond
-   to the values used by ACPI function index 8.
-
- +----------+-------------------------------------------------------------+
- | value    | Description                                                 |
- +----------+-------------------------------------------------------------+
- | 0        | Operation is not implemented.                               |
- +----------+-------------------------------------------------------------+
- | 1        | Operation is only accessible through firmware.              |
- +----------+-------------------------------------------------------------+
- | 2        | Operation is blocked for OS by firmware configuration.      |
- +----------+-------------------------------------------------------------+
- | 3        | Operation is allowed and physically present user required.  |
- +----------+-------------------------------------------------------------+
- | 4        | Operation is allowed and physically present user is not     |
- |          | required.                                                   |
- +----------+-------------------------------------------------------------+
-
-The location of the table is given by the fw_cfg tpmppi_address field.
-The PPI memory region size is 0x400 (TPM_PPI_ADDR_SIZE) to leave
-enough room for future updates.
-
-
-QEMU files related to TPM ACPI tables:
- - hw/i386/acpi-build.c
- - include/hw/acpi/tpm.h
-
-
-= TPM backend devices =
-
-The TPM implementation is split into two parts, frontend and backend. The
-frontend part is the hardware interface, such as the TPM TIS interface
-described earlier, and the other part is the TPM backend interface. The backend
-interfaces implement the interaction with a TPM device, which may be a physical
-or an emulated device. The split between the front- and backend devices allows
-a frontend to be connected with any available backend. This enables the TIS
-interface to be used with the passthrough backend or the (future) swtpm backend.
-
-
-QEMU files related to TPM backends:
- - backends/tpm.c
- - include/sysemu/tpm_backend.h
- - include/sysemu/tpm_backend_int.h
-
-
-== The QEMU TPM passthrough device ==
-
-In case QEMU is run on Linux as the host operating system it is possible to
-make the hardware TPM device available to a single QEMU guest. In this case the
-user must make sure that no other program is using the device, e.g., /dev/tpm0,
-before trying to start QEMU with it.
-
-The passthrough driver uses the host's TPM device for sending TPM commands
-and receiving responses from. Besides that it accesses the TPM device's sysfs
-entry for support of command cancellation. Since none of the state of a
-hardware TPM can be migrated between hosts, virtual machine migration is
-disabled when the TPM passthrough driver is used.
-
-Since the host's TPM device will already be initialized by the host's firmware,
-certain commands, e.g. TPM_Startup(), sent by the virtual firmware for device
-initialization, will fail. In this case the firmware should not use the TPM.
-
-Sharing the device with the host is generally not a recommended usage scenario
-for a TPM device. The primary reason for this is that two operating systems can
-then access the device's single set of resources, such as platform configuration
-registers (PCRs). Applications or kernel security subsystems, such as the
-Linux Integrity Measurement Architecture (IMA), are not expecting to share PCRs.
-
-
-QEMU files related to the TPM passthrough device:
- - hw/tpm/tpm_passthrough.c
- - hw/tpm/tpm_util.c
- - hw/tpm/tpm_util.h
-
-
-Command line to start QEMU with the TPM passthrough device using the host's
-hardware TPM /dev/tpm0:
-
-qemu-system-x86_64 -display sdl -accel kvm \
-  -m 1024 -boot d -bios bios-256k.bin -boot menu=on \
-  -tpmdev passthrough,id=tpm0,path=/dev/tpm0 \
-  -device tpm-tis,tpmdev=tpm0 test.img
-
-The following commands should result in similar output inside the VM with a
-Linux kernel that either has the TPM TIS driver built-in or available as a
-module:
-
-#> dmesg | grep -i tpm
-[    0.711310] tpm_tis 00:06: 1.2 TPM (device=id 0x1, rev-id 1)
-
-#> dmesg | grep TCPA
-[    0.000000] ACPI: TCPA 0x0000000003FFD191C 000032 (v02 BOCHS  \
-    BXPCTCPA 0000001 BXPC 00000001)
-
-#> ls -l /dev/tpm*
-crw-------. 1 root root 10, 224 Jul 11 10:11 /dev/tpm0
-
-#> find /sys/devices/ | grep pcrs$ | xargs cat
-PCR-00: 35 4E 3B CE 23 9F 38 59 ...
-...
-PCR-23: 00 00 00 00 00 00 00 00 ...
-
-
-== The QEMU TPM emulator device ==
-
-The TPM emulator device uses an external TPM emulator called 'swtpm' for
-sending TPM commands to and receiving responses from. The swtpm program
-must have been started before trying to access it through the TPM emulator
-with QEMU.
-
-The TPM emulator implements a command channel for transferring TPM commands
-and responses as well as a control channel over which control commands can
-be sent. The specification for the control channel can be found here:
-
-https://github.com/stefanberger/swtpm/blob/master/man/man3/swtpm_ioctls.pod
-
-
-The control channel serves the purpose of resetting, initializing, and
-migrating the TPM state, among other things.
-
-The swtpm program behaves like a hardware TPM and therefore needs to be
-initialized by the firmware running inside the QEMU virtual machine.
-One necessary step for initializing the device is to send the TPM_Startup
-command to it. SeaBIOS, for example, has been instrumented to initialize
-a TPM 1.2 or TPM 2 device using this command.
-
-
-QEMU files related to the TPM emulator device:
- - hw/tpm/tpm_emulator.c
- - hw/tpm/tpm_util.c
- - hw/tpm/tpm_util.h
-
-
-The following commands start the swtpm with a UnixIO control channel over
-a socket interface. They do not need to be run as root.
-
-mkdir /tmp/mytpm1
-swtpm socket --tpmstate dir=/tmp/mytpm1 \
-  --ctrl type=unixio,path=/tmp/mytpm1/swtpm-sock \
-  --log level=20
-
-Command line to start QEMU with the TPM emulator device communicating with
-the swtpm:
-
-qemu-system-x86_64 -display sdl -accel kvm \
-  -m 1024 -boot d -bios bios-256k.bin -boot menu=on \
-  -chardev socket,id=chrtpm,path=/tmp/mytpm1/swtpm-sock \
-  -tpmdev emulator,id=tpm0,chardev=chrtpm \
-  -device tpm-tis,tpmdev=tpm0 test.img
-
-
-In case SeaBIOS is used as firmware, it should show the TPM menu item
-after entering the menu with 'ESC'.
-
-Select boot device:
-1. DVD/CD [ata1-0: QEMU DVD-ROM ATAPI-4 DVD/CD]
-[...]
-5. Legacy option rom
-
-t. TPM Configuration
-
-
-The following commands should result in similar output inside the VM with a
-Linux kernel that either has the TPM TIS driver built-in or available as a
-module:
-
-#> dmesg | grep -i tpm
-[    0.711310] tpm_tis 00:06: 1.2 TPM (device=id 0x1, rev-id 1)
-
-#> dmesg | grep TCPA
-[    0.000000] ACPI: TCPA 0x0000000003FFD191C 000032 (v02 BOCHS  \
-    BXPCTCPA 0000001 BXPC 00000001)
-
-#> ls -l /dev/tpm*
-crw-------. 1 root root 10, 224 Jul 11 10:11 /dev/tpm0
-
-#> find /sys/devices/ | grep pcrs$ | xargs cat
-PCR-00: 35 4E 3B CE 23 9F 38 59 ...
-...
-PCR-23: 00 00 00 00 00 00 00 00 ...
-
-
-=== Migration with the TPM emulator ===
-
-The TPM emulator supports the following types of virtual machine migration:
-
-- VM save / restore (migration into a file)
-- Network migration
-- Snapshotting (migration into storage like QoW2 or QED)
-
-The following command sequences can be used to test VM save / restore.
-
-
-In a 1st terminal start an instance of a swtpm using the following command:
-
-mkdir /tmp/mytpm1
-swtpm socket --tpmstate dir=/tmp/mytpm1 \
-  --ctrl type=unixio,path=/tmp/mytpm1/swtpm-sock \
-  --log level=20 --tpm2
-
-In a 2nd terminal start the VM:
-
-qemu-system-x86_64 -display sdl -accel kvm \
-  -m 1024 -boot d -bios bios-256k.bin -boot menu=on \
-  -chardev socket,id=chrtpm,path=/tmp/mytpm1/swtpm-sock \
-  -tpmdev emulator,id=tpm0,chardev=chrtpm \
-  -device tpm-tis,tpmdev=tpm0 \
-  -monitor stdio \
-  test.img
-
-Verify that the attached TPM is working as expected using applications inside
-the VM.
-
-To store the state of the VM use the following command in the QEMU monitor in
-the 2nd terminal:
-
-(qemu) migrate "exec:cat > testvm.bin"
-(qemu) quit
-
-At this point a file called 'testvm.bin' should exists and the swtpm and QEMU
-processes should have ended.
-
-To test 'VM restore' you have to start the swtpm with the same parameters
-as before. If previously a TPM 2 [--tpm2] was saved, --tpm2 must now be
-passed again on the command line.
-
-In the 1st terminal restart the swtpm with the same command line as before:
-
-swtpm socket --tpmstate dir=/tmp/mytpm1 \
-  --ctrl type=unixio,path=/tmp/mytpm1/swtpm-sock \
-  --log level=20 --tpm2
-
-In the 2nd terminal restore the state of the VM using the additional
-'-incoming' option.
-
-qemu-system-x86_64 -display sdl -accel kvm \
-  -m 1024 -boot d -bios bios-256k.bin -boot menu=on \
-  -chardev socket,id=chrtpm,path=/tmp/mytpm1/swtpm-sock \
-  -tpmdev emulator,id=tpm0,chardev=chrtpm \
-  -device tpm-tis,tpmdev=tpm0 \
-  -incoming "exec:cat < testvm.bin" \
-  test.img
-
-
-Troubleshooting migration:
-
-There are several reasons why migration may fail. In case of problems,
-please ensure that the command lines adhere to the following rules and,
-if possible, that identical versions of QEMU and swtpm are used at all
-times.
-
-VM save and restore:
- - QEMU command line parameters should be identical apart from the
-   '-incoming' option on VM restore
- - swtpm command line parameters should be identical
-
-VM migration to 'localhost':
- - QEMU command line parameters should be identical apart from the
-   '-incoming' option on the destination side
- - swtpm command line parameters should point to two different
-   directories on the source and destination swtpm (--tpmstate dir=...)
-   (especially if different versions of libtpms were to be used on the
-   same machine).
-
-VM migration across the network:
- - QEMU command line parameters should be identical apart from the
-   '-incoming' option on the destination side
- - swtpm command line parameters should be identical
-
-VM Snapshotting:
- - QEMU command line parameters should be identical
- - swtpm command line parameters should be identical
-
-
-Besides that, migration failure reasons on the swtpm level may include
-the following:
-
- - the versions of the swtpm on the source and destination sides are
-   incompatible
-   - downgrading of TPM state may not be supported
-   - the source and destination libtpms were compiled with different
-     compile-time options and the destination side refuses to accept the
-     state
- - different migration keys are used on the source and destination side
-   and the destination side cannot decrypt the migrated state
-   (swtpm ... --migration-key ... )
diff --git a/hw/intc/xics.c b/hw/intc/xics.c
index 785b607..c5d507e 100644
--- a/hw/intc/xics.c
+++ b/hw/intc/xics.c
@@ -217,7 +217,7 @@
     }
 }
 
-static void icp_irq(ICSState *ics, int server, int nr, uint8_t priority)
+void icp_irq(ICSState *ics, int server, int nr, uint8_t priority)
 {
     ICPState *icp = xics_icp_get(ics->xics, server);
 
@@ -512,8 +512,14 @@
 
 static void ics_reject(ICSState *ics, uint32_t nr)
 {
+    ICSStateClass *isc = ICS_GET_CLASS(ics);
     ICSIRQState *irq = ics->irqs + nr - ics->offset;
 
+    if (isc->reject) {
+        isc->reject(ics, nr);
+        return;
+    }
+
     trace_xics_ics_reject(nr, nr - ics->offset);
     if (irq->flags & XICS_FLAGS_IRQ_MSI) {
         irq->status |= XICS_STATUS_REJECTED;
@@ -524,8 +530,14 @@
 
 void ics_resend(ICSState *ics)
 {
+    ICSStateClass *isc = ICS_GET_CLASS(ics);
     int i;
 
+    if (isc->resend) {
+        isc->resend(ics);
+        return;
+    }
+
     for (i = 0; i < ics->nr_irqs; i++) {
         /* FIXME: filter by server#? */
         if (ics->irqs[i].flags & XICS_FLAGS_IRQ_LSI) {
diff --git a/hw/pci-host/Makefile.objs b/hw/pci-host/Makefile.objs
index 9c466fa..8c87e84 100644
--- a/hw/pci-host/Makefile.objs
+++ b/hw/pci-host/Makefile.objs
@@ -20,3 +20,5 @@
 common-obj-$(CONFIG_PCI_EXPRESS_XILINX) += xilinx-pcie.o
 
 common-obj-$(CONFIG_PCI_EXPRESS_DESIGNWARE) += designware.o
+obj-$(CONFIG_POWERNV) += pnv_phb4.o pnv_phb4_pec.o
+obj-$(CONFIG_POWERNV) += pnv_phb3.o pnv_phb3_msi.o pnv_phb3_pbcq.o
diff --git a/hw/pci-host/pnv_phb3.c b/hw/pci-host/pnv_phb3.c
new file mode 100644
index 0000000..74618fa
--- /dev/null
+++ b/hw/pci-host/pnv_phb3.c
@@ -0,0 +1,1197 @@
+/*
+ * QEMU PowerPC PowerNV (POWER8) PHB3 model
+ *
+ * Copyright (c) 2014-2020, IBM Corporation.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qapi/visitor.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "hw/pci-host/pnv_phb3_regs.h"
+#include "hw/pci-host/pnv_phb3.h"
+#include "hw/pci/pcie_host.h"
+#include "hw/pci/pcie_port.h"
+#include "hw/ppc/pnv.h"
+#include "hw/irq.h"
+#include "hw/qdev-properties.h"
+
+#define phb3_error(phb, fmt, ...)                                       \
+    qemu_log_mask(LOG_GUEST_ERROR, "phb3[%d:%d]: " fmt "\n",            \
+                  (phb)->chip_id, (phb)->phb_id, ## __VA_ARGS__)
+
+static PCIDevice *pnv_phb3_find_cfg_dev(PnvPHB3 *phb)
+{
+    PCIHostState *pci = PCI_HOST_BRIDGE(phb);
+    uint64_t addr = phb->regs[PHB_CONFIG_ADDRESS >> 3];
+    uint8_t bus, devfn;
+
+    if (!(addr >> 63)) {
+        return NULL;
+    }
+    bus = (addr >> 52) & 0xff;
+    devfn = (addr >> 44) & 0xff;
+
+    return pci_find_device(pci->bus, bus, devfn);
+}
+
+/*
+ * The CONFIG_DATA register expects little endian accesses, but as the
+ * region is big endian, we have to swap the value.
+ */
+static void pnv_phb3_config_write(PnvPHB3 *phb, unsigned off,
+                                  unsigned size, uint64_t val)
+{
+    uint32_t cfg_addr, limit;
+    PCIDevice *pdev;
+
+    pdev = pnv_phb3_find_cfg_dev(phb);
+    if (!pdev) {
+        return;
+    }
+    cfg_addr = (phb->regs[PHB_CONFIG_ADDRESS >> 3] >> 32) & 0xffc;
+    cfg_addr |= off;
+    limit = pci_config_size(pdev);
+    if (limit <= cfg_addr) {
+        /*
+         * conventional pci device can be behind pcie-to-pci bridge.
+         * 256 <= addr < 4K has no effects.
+         */
+        return;
+    }
+    switch (size) {
+    case 1:
+        break;
+    case 2:
+        val = bswap16(val);
+        break;
+    case 4:
+        val = bswap32(val);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    pci_host_config_write_common(pdev, cfg_addr, limit, val, size);
+}
+
+static uint64_t pnv_phb3_config_read(PnvPHB3 *phb, unsigned off,
+                                     unsigned size)
+{
+    uint32_t cfg_addr, limit;
+    PCIDevice *pdev;
+    uint64_t val;
+
+    pdev = pnv_phb3_find_cfg_dev(phb);
+    if (!pdev) {
+        return ~0ull;
+    }
+    cfg_addr = (phb->regs[PHB_CONFIG_ADDRESS >> 3] >> 32) & 0xffc;
+    cfg_addr |= off;
+    limit = pci_config_size(pdev);
+    if (limit <= cfg_addr) {
+        /*
+         * conventional pci device can be behind pcie-to-pci bridge.
+         * 256 <= addr < 4K has no effects.
+         */
+        return ~0ull;
+    }
+    val = pci_host_config_read_common(pdev, cfg_addr, limit, size);
+    switch (size) {
+    case 1:
+        return val;
+    case 2:
+        return bswap16(val);
+    case 4:
+        return bswap32(val);
+    default:
+        g_assert_not_reached();
+    }
+}
+
+static void pnv_phb3_check_m32(PnvPHB3 *phb)
+{
+    uint64_t base, start, size;
+    MemoryRegion *parent;
+    PnvPBCQState *pbcq = &phb->pbcq;
+
+    if (memory_region_is_mapped(&phb->mr_m32)) {
+        memory_region_del_subregion(phb->mr_m32.container, &phb->mr_m32);
+    }
+
+    if (!(phb->regs[PHB_PHB3_CONFIG >> 3] & PHB_PHB3C_M32_EN)) {
+        return;
+    }
+
+    /* Grab geometry from registers */
+    base = phb->regs[PHB_M32_BASE_ADDR >> 3];
+    start = phb->regs[PHB_M32_START_ADDR >> 3];
+    size = ~(phb->regs[PHB_M32_BASE_MASK >> 3] | 0xfffc000000000000ull) + 1;
+
+    /* Check if it matches an enabled MMIO region in the PBCQ */
+    if (memory_region_is_mapped(&pbcq->mmbar0) &&
+        base >= pbcq->mmio0_base &&
+        (base + size) <= (pbcq->mmio0_base + pbcq->mmio0_size)) {
+        parent = &pbcq->mmbar0;
+        base -= pbcq->mmio0_base;
+    } else if (memory_region_is_mapped(&pbcq->mmbar1) &&
+               base >= pbcq->mmio1_base &&
+               (base + size) <= (pbcq->mmio1_base + pbcq->mmio1_size)) {
+        parent = &pbcq->mmbar1;
+        base -= pbcq->mmio1_base;
+    } else {
+        return;
+    }
+
+    /* Create alias */
+    memory_region_init_alias(&phb->mr_m32, OBJECT(phb), "phb3-m32",
+                             &phb->pci_mmio, start, size);
+    memory_region_add_subregion(parent, base, &phb->mr_m32);
+}
+
+static void pnv_phb3_check_m64(PnvPHB3 *phb, uint32_t index)
+{
+    uint64_t base, start, size, m64;
+    MemoryRegion *parent;
+    PnvPBCQState *pbcq = &phb->pbcq;
+
+    if (memory_region_is_mapped(&phb->mr_m64[index])) {
+        /* Should we destroy it in RCU friendly way... ? */
+        memory_region_del_subregion(phb->mr_m64[index].container,
+                                    &phb->mr_m64[index]);
+    }
+
+    /* Get table entry */
+    m64 = phb->ioda_M64BT[index];
+
+    if (!(m64 & IODA2_M64BT_ENABLE)) {
+        return;
+    }
+
+    /* Grab geometry from registers */
+    base = GETFIELD(IODA2_M64BT_BASE, m64) << 20;
+    if (m64 & IODA2_M64BT_SINGLE_PE) {
+        base &= ~0x1ffffffull;
+    }
+    size = GETFIELD(IODA2_M64BT_MASK, m64) << 20;
+    size |= 0xfffc000000000000ull;
+    size = ~size + 1;
+    start = base | (phb->regs[PHB_M64_UPPER_BITS >> 3]);
+
+    /* Check if it matches an enabled MMIO region in the PBCQ */
+    if (memory_region_is_mapped(&pbcq->mmbar0) &&
+        base >= pbcq->mmio0_base &&
+        (base + size) <= (pbcq->mmio0_base + pbcq->mmio0_size)) {
+        parent = &pbcq->mmbar0;
+        base -= pbcq->mmio0_base;
+    } else if (memory_region_is_mapped(&pbcq->mmbar1) &&
+               base >= pbcq->mmio1_base &&
+               (base + size) <= (pbcq->mmio1_base + pbcq->mmio1_size)) {
+        parent = &pbcq->mmbar1;
+        base -= pbcq->mmio1_base;
+    } else {
+        return;
+    }
+
+    /* Create alias */
+    memory_region_init_alias(&phb->mr_m64[index], OBJECT(phb), "phb3-m64",
+                             &phb->pci_mmio, start, size);
+    memory_region_add_subregion(parent, base, &phb->mr_m64[index]);
+}
+
+static void pnv_phb3_check_all_m64s(PnvPHB3 *phb)
+{
+    uint64_t i;
+
+    for (i = 0; i < PNV_PHB3_NUM_M64; i++) {
+        pnv_phb3_check_m64(phb, i);
+    }
+}
+
+static void pnv_phb3_lxivt_write(PnvPHB3 *phb, unsigned idx, uint64_t val)
+{
+    uint8_t server, prio;
+
+    phb->ioda_LXIVT[idx] = val & (IODA2_LXIVT_SERVER |
+                                  IODA2_LXIVT_PRIORITY |
+                                  IODA2_LXIVT_NODE_ID);
+    server = GETFIELD(IODA2_LXIVT_SERVER, val);
+    prio = GETFIELD(IODA2_LXIVT_PRIORITY, val);
+
+    /*
+     * The low order 2 bits are the link pointer (Type II interrupts).
+     * Shift back to get a valid IRQ server.
+     */
+    server >>= 2;
+
+    ics_write_xive(&phb->lsis, idx, server, prio, prio);
+}
+
+static uint64_t *pnv_phb3_ioda_access(PnvPHB3 *phb,
+                                      unsigned *out_table, unsigned *out_idx)
+{
+    uint64_t adreg = phb->regs[PHB_IODA_ADDR >> 3];
+    unsigned int index = GETFIELD(PHB_IODA_AD_TADR, adreg);
+    unsigned int table = GETFIELD(PHB_IODA_AD_TSEL, adreg);
+    unsigned int mask;
+    uint64_t *tptr = NULL;
+
+    switch (table) {
+    case IODA2_TBL_LIST:
+        tptr = phb->ioda_LIST;
+        mask = 7;
+        break;
+    case IODA2_TBL_LXIVT:
+        tptr = phb->ioda_LXIVT;
+        mask = 7;
+        break;
+    case IODA2_TBL_IVC_CAM:
+    case IODA2_TBL_RBA:
+        mask = 31;
+        break;
+    case IODA2_TBL_RCAM:
+        mask = 63;
+        break;
+    case IODA2_TBL_MRT:
+        mask = 7;
+        break;
+    case IODA2_TBL_PESTA:
+    case IODA2_TBL_PESTB:
+        mask = 255;
+        break;
+    case IODA2_TBL_TVT:
+        tptr = phb->ioda_TVT;
+        mask = 511;
+        break;
+    case IODA2_TBL_TCAM:
+    case IODA2_TBL_TDR:
+        mask = 63;
+        break;
+    case IODA2_TBL_M64BT:
+        tptr = phb->ioda_M64BT;
+        mask = 15;
+        break;
+    case IODA2_TBL_M32DT:
+        tptr = phb->ioda_MDT;
+        mask = 255;
+        break;
+    case IODA2_TBL_PEEV:
+        tptr = phb->ioda_PEEV;
+        mask = 3;
+        break;
+    default:
+        phb3_error(phb, "invalid IODA table %d", table);
+        return NULL;
+    }
+    index &= mask;
+    if (out_idx) {
+        *out_idx = index;
+    }
+    if (out_table) {
+        *out_table = table;
+    }
+    if (tptr) {
+        tptr += index;
+    }
+    if (adreg & PHB_IODA_AD_AUTOINC) {
+        index = (index + 1) & mask;
+        adreg = SETFIELD(PHB_IODA_AD_TADR, adreg, index);
+    }
+    phb->regs[PHB_IODA_ADDR >> 3] = adreg;
+    return tptr;
+}
+
+static uint64_t pnv_phb3_ioda_read(PnvPHB3 *phb)
+{
+        unsigned table;
+        uint64_t *tptr;
+
+        tptr = pnv_phb3_ioda_access(phb, &table, NULL);
+        if (!tptr) {
+            /* Return 0 on unsupported tables, not ff's */
+            return 0;
+        }
+        return *tptr;
+}
+
+static void pnv_phb3_ioda_write(PnvPHB3 *phb, uint64_t val)
+{
+        unsigned table, idx;
+        uint64_t *tptr;
+
+        tptr = pnv_phb3_ioda_access(phb, &table, &idx);
+        if (!tptr) {
+            return;
+        }
+
+        /* Handle side effects */
+        switch (table) {
+        case IODA2_TBL_LXIVT:
+            pnv_phb3_lxivt_write(phb, idx, val);
+            break;
+        case IODA2_TBL_M64BT:
+            *tptr = val;
+            pnv_phb3_check_m64(phb, idx);
+            break;
+        default:
+            *tptr = val;
+        }
+}
+
+/*
+ * This is called whenever the PHB LSI, MSI source ID register or
+ * the PBCQ irq filters are written.
+ */
+void pnv_phb3_remap_irqs(PnvPHB3 *phb)
+{
+    ICSState *ics = &phb->lsis;
+    uint32_t local, global, count, mask, comp;
+    uint64_t baren;
+    PnvPBCQState *pbcq = &phb->pbcq;
+
+    /*
+     * First check if we are enabled. Unlike real HW we don't separate
+     * TX and RX so we enable if both are set
+     */
+    baren = pbcq->nest_regs[PBCQ_NEST_BAR_EN];
+    if (!(baren & PBCQ_NEST_BAR_EN_IRSN_RX) ||
+        !(baren & PBCQ_NEST_BAR_EN_IRSN_TX)) {
+        ics->offset = 0;
+        return;
+    }
+
+    /* Grab local LSI source ID */
+    local = GETFIELD(PHB_LSI_SRC_ID, phb->regs[PHB_LSI_SOURCE_ID >> 3]) << 3;
+
+    /* Grab global one and compare */
+    global = GETFIELD(PBCQ_NEST_LSI_SRC,
+                      pbcq->nest_regs[PBCQ_NEST_LSI_SRC_ID]) << 3;
+    if (global != local) {
+        /*
+         * This happens during initialization, let's come back when we
+         * are properly configured
+         */
+        ics->offset = 0;
+        return;
+    }
+
+    /* Get the base on the powerbus */
+    comp = GETFIELD(PBCQ_NEST_IRSN_COMP,
+                    pbcq->nest_regs[PBCQ_NEST_IRSN_COMPARE]);
+    mask = GETFIELD(PBCQ_NEST_IRSN_COMP,
+                    pbcq->nest_regs[PBCQ_NEST_IRSN_MASK]);
+    count = ((~mask) + 1) & 0x7ffff;
+    phb->total_irq = count;
+
+    /* Sanity checks */
+    if ((global + PNV_PHB3_NUM_LSI) > count) {
+        phb3_error(phb, "LSIs out of reach: LSI base=%d total irq=%d", global,
+                   count);
+    }
+
+    if (count > 2048) {
+        phb3_error(phb, "More interrupts than supported: %d", count);
+    }
+
+    if ((comp & mask) != comp) {
+        phb3_error(phb, "IRQ compare bits not in mask: comp=0x%x mask=0x%x",
+                   comp, mask);
+        comp &= mask;
+    }
+    /* Setup LSI offset */
+    ics->offset = comp + global;
+
+    /* Setup MSI offset */
+    pnv_phb3_msi_update_config(&phb->msis, comp, count - PNV_PHB3_NUM_LSI);
+}
+
+static void pnv_phb3_lsi_src_id_write(PnvPHB3 *phb, uint64_t val)
+{
+    /* Sanitize content */
+    val &= PHB_LSI_SRC_ID;
+    phb->regs[PHB_LSI_SOURCE_ID >> 3] = val;
+    pnv_phb3_remap_irqs(phb);
+}
+
+static void pnv_phb3_rtc_invalidate(PnvPHB3 *phb, uint64_t val)
+{
+    PnvPhb3DMASpace *ds;
+
+    /* Always invalidate all for now ... */
+    QLIST_FOREACH(ds, &phb->dma_spaces, list) {
+        ds->pe_num = PHB_INVALID_PE;
+    }
+}
+
+
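+/*
+ * Map or unmap the 32-bit (at 0xffff0000) and 64-bit (at 1ull << 60)
+ * MSI windows of a device DMA space according to the PHB3
+ * configuration register.
+ */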
+static void pnv_phb3_update_msi_regions(PnvPhb3DMASpace *ds)
+{
+    uint64_t cfg = ds->phb->regs[PHB_PHB3_CONFIG >> 3];
+
+    if (cfg & PHB_PHB3C_32BIT_MSI_EN) {
+        if (!memory_region_is_mapped(&ds->msi32_mr)) {
+            memory_region_add_subregion(MEMORY_REGION(&ds->dma_mr),
+                                        0xffff0000, &ds->msi32_mr);
+        }
+    } else {
+        if (memory_region_is_mapped(&ds->msi32_mr)) {
+            memory_region_del_subregion(MEMORY_REGION(&ds->dma_mr),
+                                        &ds->msi32_mr);
+        }
+    }
+
+    if (cfg & PHB_PHB3C_64BIT_MSI_EN) {
+        if (!memory_region_is_mapped(&ds->msi64_mr)) {
+            memory_region_add_subregion(MEMORY_REGION(&ds->dma_mr),
+                                        (1ull << 60), &ds->msi64_mr);
+        }
+    } else {
+        if (memory_region_is_mapped(&ds->msi64_mr)) {
+            memory_region_del_subregion(MEMORY_REGION(&ds->dma_mr),
+                                        &ds->msi64_mr);
+        }
+    }
+}
+
+static void pnv_phb3_update_all_msi_regions(PnvPHB3 *phb)
+{
+    PnvPhb3DMASpace *ds;
+
+    QLIST_FOREACH(ds, &phb->dma_spaces, list) {
+        pnv_phb3_update_msi_regions(ds);
+    }
+}
+
+void pnv_phb3_reg_write(void *opaque, hwaddr off, uint64_t val, unsigned size)
+{
+    PnvPHB3 *phb = opaque;
+    bool changed;
+
+    /* Special case configuration data */
+    if ((off & 0xfffc) == PHB_CONFIG_DATA) {
+        pnv_phb3_config_write(phb, off & 0x3, size, val);
+        return;
+    }
+
+    /* Other registers are 64-bit only */
+    if (size != 8 || off & 0x7) {
+        phb3_error(phb, "Invalid register access, offset: 0x%"PRIx64" size: %d",
+                   off, size);
+        return;
+    }
+
+    /* Handle masking & filtering */
+    switch (off) {
+    case PHB_M64_UPPER_BITS:
+        val &= 0xfffc000000000000ull;
+        break;
+    case PHB_Q_DMA_R:
+        /*
+         * This is enough logic to make SW happy but we aren't actually
+         * quiescing the DMAs
+         */
+        if (val & PHB_Q_DMA_R_AUTORESET) {
+            val = 0;
+        } else {
+            val &= PHB_Q_DMA_R_QUIESCE_DMA;
+        }
+        break;
+    /* LEM stuff */
+    case PHB_LEM_FIR_AND_MASK:
+        phb->regs[PHB_LEM_FIR_ACCUM >> 3] &= val;
+        return;
+    case PHB_LEM_FIR_OR_MASK:
+        phb->regs[PHB_LEM_FIR_ACCUM >> 3] |= val;
+        return;
+    case PHB_LEM_ERROR_AND_MASK:
+        phb->regs[PHB_LEM_ERROR_MASK >> 3] &= val;
+        return;
+    case PHB_LEM_ERROR_OR_MASK:
+        phb->regs[PHB_LEM_ERROR_MASK >> 3] |= val;
+        return;
+    case PHB_LEM_WOF:
+        val = 0;
+        break;
+    }
+
+    /* Record whether it changed */
+    changed = phb->regs[off >> 3] != val;
+
+    /* Store in register cache first */
+    phb->regs[off >> 3] = val;
+
+    /* Handle side effects */
+    switch (off) {
+    case PHB_PHB3_CONFIG:
+        if (changed) {
+            pnv_phb3_update_all_msi_regions(phb);
+        }
+        /* fall through */
+    case PHB_M32_BASE_ADDR:
+    case PHB_M32_BASE_MASK:
+    case PHB_M32_START_ADDR:
+        if (changed) {
+            pnv_phb3_check_m32(phb);
+        }
+        break;
+    case PHB_M64_UPPER_BITS:
+        if (changed) {
+            pnv_phb3_check_all_m64s(phb);
+        }
+        break;
+    case PHB_LSI_SOURCE_ID:
+        if (changed) {
+            pnv_phb3_lsi_src_id_write(phb, val);
+        }
+        break;
+
+    /* IODA table accesses */
+    case PHB_IODA_DATA0:
+        pnv_phb3_ioda_write(phb, val);
+        break;
+
+    /* RTC invalidation */
+    case PHB_RTC_INVALIDATE:
+        pnv_phb3_rtc_invalidate(phb, val);
+        break;
+
+    /* FFI request */
+    case PHB_FFI_REQUEST:
+        pnv_phb3_msi_ffi(&phb->msis, val);
+        break;
+
+    /* Silent simple writes */
+    case PHB_CONFIG_ADDRESS:
+    case PHB_IODA_ADDR:
+    case PHB_TCE_KILL:
+    case PHB_TCE_SPEC_CTL:
+    case PHB_PEST_BAR:
+    case PHB_PELTV_BAR:
+    case PHB_RTT_BAR:
+    case PHB_RBA_BAR:
+    case PHB_IVT_BAR:
+    case PHB_FFI_LOCK:
+    case PHB_LEM_FIR_ACCUM:
+    case PHB_LEM_ERROR_MASK:
+    case PHB_LEM_ACTION0:
+    case PHB_LEM_ACTION1:
+        break;
+
+    /* Noise on anything else */
+    default:
+        qemu_log_mask(LOG_UNIMP, "phb3: reg_write 0x%"PRIx64"=%"PRIx64"\n",
+                      off, val);
+    }
+}
+
+uint64_t pnv_phb3_reg_read(void *opaque, hwaddr off, unsigned size)
+{
+    PnvPHB3 *phb = opaque;
+    PCIHostState *pci = PCI_HOST_BRIDGE(phb);
+    uint64_t val;
+
+    if ((off & 0xfffc) == PHB_CONFIG_DATA) {
+        return pnv_phb3_config_read(phb, off & 0x3, size);
+    }
+
+    /* Other registers are 64-bit only */
+    if (size != 8 || off & 0x7) {
+        phb3_error(phb, "Invalid register access, offset: 0x%"PRIx64" size: %d",
+                   off, size);
+        return ~0ull;
+    }
+
+    /* Default read from cache */
+    val = phb->regs[off >> 3];
+
+    switch (off) {
+    /* Simulate Venice DD2.0 */
+    case PHB_VERSION:
+        return 0x000000a300000005ull;
+    case PHB_PCIE_SYSTEM_CONFIG:
+        return 0x441100fc30000000;
+
+    /* IODA table accesses */
+    case PHB_IODA_DATA0:
+        return pnv_phb3_ioda_read(phb);
+
+    /* Link training always appears trained */
+    case PHB_PCIE_DLP_TRAIN_CTL:
+        if (!pci_find_device(pci->bus, 1, 0)) {
+            return 0;
+        }
+        return PHB_PCIE_DLP_INBAND_PRESENCE | PHB_PCIE_DLP_TC_DL_LINKACT;
+
+    /* FFI Lock */
+    case PHB_FFI_LOCK:
+        /* Set lock and return previous value */
+        phb->regs[off >> 3] |= PHB_FFI_LOCK_STATE;
+        return val;
+
+    /* DMA read sync: make it look like it's complete */
+    case PHB_DMARD_SYNC:
+        return PHB_DMARD_SYNC_COMPLETE;
+
+    /* Silent simple reads */
+    case PHB_PHB3_CONFIG:
+    case PHB_M32_BASE_ADDR:
+    case PHB_M32_BASE_MASK:
+    case PHB_M32_START_ADDR:
+    case PHB_CONFIG_ADDRESS:
+    case PHB_IODA_ADDR:
+    case PHB_RTC_INVALIDATE:
+    case PHB_TCE_KILL:
+    case PHB_TCE_SPEC_CTL:
+    case PHB_PEST_BAR:
+    case PHB_PELTV_BAR:
+    case PHB_RTT_BAR:
+    case PHB_RBA_BAR:
+    case PHB_IVT_BAR:
+    case PHB_M64_UPPER_BITS:
+    case PHB_LEM_FIR_ACCUM:
+    case PHB_LEM_ERROR_MASK:
+    case PHB_LEM_ACTION0:
+    case PHB_LEM_ACTION1:
+        break;
+
+    /* Noise on anything else */
+    default:
+        qemu_log_mask(LOG_UNIMP, "phb3: reg_read 0x%"PRIx64"=%"PRIx64"\n",
+                      off, val);
+    }
+    return val;
+}
+
+static const MemoryRegionOps pnv_phb3_reg_ops = {
+    .read = pnv_phb3_reg_read,
+    .write = pnv_phb3_reg_write,
+    .valid.min_access_size = 1,
+    .valid.max_access_size = 8,
+    .impl.min_access_size = 1,
+    .impl.max_access_size = 8,
+    .endianness = DEVICE_BIG_ENDIAN,
+};
+
+static int pnv_phb3_map_irq(PCIDevice *pci_dev, int irq_num)
+{
+    /* Check that out properly ... */
+    return irq_num & 3;
+}
+
+static void pnv_phb3_set_irq(void *opaque, int irq_num, int level)
+{
+    PnvPHB3 *phb = opaque;
+
+    /* LSI only ... */
+    if (irq_num > 3) {
+        phb3_error(phb, "Unknown IRQ to set %d", irq_num);
+    }
+    qemu_set_irq(phb->qirqs[irq_num], level);
+}
+
+static bool pnv_phb3_resolve_pe(PnvPhb3DMASpace *ds)
+{
+    uint64_t rtt, addr;
+    uint16_t rte;
+    int bus_num;
+
+    /* Already resolved ? */
+    if (ds->pe_num != PHB_INVALID_PE) {
+        return true;
+    }
+
+    /* We need to lookup the RTT */
+    rtt = ds->phb->regs[PHB_RTT_BAR >> 3];
+    if (!(rtt & PHB_RTT_BAR_ENABLE)) {
+        phb3_error(ds->phb, "DMA with RTT BAR disabled !");
+        /* Set error bits ? fence ? ... */
+        return false;
+    }
+
+    /* Read RTE */
+    bus_num = pci_bus_num(ds->bus);
+    addr = rtt & PHB_RTT_BASE_ADDRESS_MASK;
+    addr += 2 * ((bus_num << 8) | ds->devfn);
+    if (dma_memory_read(&address_space_memory, addr, &rte, sizeof(rte))) {
+        phb3_error(ds->phb, "Failed to read RTT entry at 0x%"PRIx64, addr);
+        /* Set error bits ? fence ? ... */
+        return false;
+    }
+    rte = be16_to_cpu(rte);
+
+    /* Fail upon reading of invalid PE# */
+    if (rte >= PNV_PHB3_NUM_PE) {
+        phb3_error(ds->phb, "RTE for RID 0x%x invalid (%04x)", ds->devfn, rte);
+        /* Set error bits ? fence ? ... */
+        return false;
+    }
+    ds->pe_num = rte;
+    return true;
+}
+
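+/*
+ * Decode a TVE and fill in the IOMMU TLB entry: an IO page size of 0
+ * selects the non-translate (bypass) window, otherwise the TCE tables
+ * are walked, as many levels as the TVE specifies.
+ */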
+static void pnv_phb3_translate_tve(PnvPhb3DMASpace *ds, hwaddr addr,
+                                   bool is_write, uint64_t tve,
+                                   IOMMUTLBEntry *tlb)
+{
+    uint64_t tta = GETFIELD(IODA2_TVT_TABLE_ADDR, tve);
+    int32_t  lev = GETFIELD(IODA2_TVT_NUM_LEVELS, tve);
+    uint32_t tts = GETFIELD(IODA2_TVT_TCE_TABLE_SIZE, tve);
+    uint32_t tps = GETFIELD(IODA2_TVT_IO_PSIZE, tve);
+    PnvPHB3 *phb = ds->phb;
+
+    /* Invalid levels */
+    if (lev > 4) {
+        phb3_error(phb, "Invalid #levels in TVE %d", lev);
+        return;
+    }
+
+    /* IO Page Size of 0 means untranslated, else use TCEs */
+    if (tps == 0) {
+        /*
+         * We only support non-translate in top window.
+         *
+         * TODO: Venice/Murano support it on bottom window above 4G and
+         * Naples supports it on everything
+         */
+        if (!(tve & PPC_BIT(51))) {
+            phb3_error(phb, "xlate for invalid non-translate TVE");
+            return;
+        }
+        /* TODO: Handle boundaries */
+
+        /* Use 4k pages like q35 ... for now */
+        tlb->iova = addr & 0xfffffffffffff000ull;
+        tlb->translated_addr = addr & 0x0003fffffffff000ull;
+        tlb->addr_mask = 0xfffull;
+        tlb->perm = IOMMU_RW;
+    } else {
+        uint32_t tce_shift, tbl_shift, sh;
+        uint64_t base, taddr, tce, tce_mask;
+
+        /* TVE disabled ? */
+        if (tts == 0) {
+            phb3_error(phb, "xlate for invalid translated TVE");
+            return;
+        }
+
+        /* Address bits per bottom level TCE entry */
+        tce_shift = tps + 11;
+
+        /* Address bits per table level */
+        tbl_shift = tts + 8;
+
+        /* Top level table base address */
+        base = tta << 12;
+
+        /* Total shift to first level */
+        sh = tbl_shift * lev + tce_shift;
+
+        /* TODO: Multi-level untested */
+        while ((lev--) >= 0) {
+            /* Grab the TCE address */
+            taddr = base | (((addr >> sh) & ((1ul << tbl_shift) - 1)) << 3);
+            if (dma_memory_read(&address_space_memory, taddr, &tce,
+                                sizeof(tce))) {
+                phb3_error(phb, "Failed to read TCE at 0x%"PRIx64, taddr);
+                return;
+            }
+            tce = be64_to_cpu(tce);
+
+            /* Check permission for indirect TCE */
+            if ((lev >= 0) && !(tce & 3)) {
+                phb3_error(phb, "Invalid indirect TCE at 0x%"PRIx64, taddr);
+                phb3_error(phb, " xlate %"PRIx64":%c TVE=%"PRIx64, addr,
+                           is_write ? 'W' : 'R', tve);
+                phb3_error(phb, " tta=%"PRIx64" lev=%d tts=%d tps=%d",
+                           tta, lev, tts, tps);
+                return;
+            }
+            sh -= tbl_shift;
+            base = tce & ~0xfffull;
+        }
+
+        /* We exit the loop with TCE being the final TCE */
+        tce_mask = ~((1ull << tce_shift) - 1);
+        tlb->iova = addr & tce_mask;
+        tlb->translated_addr = tce & tce_mask;
+        tlb->addr_mask = ~tce_mask;
+        tlb->perm = tce & 3;
+        if ((is_write && !(tce & 2)) || (!is_write && !(tce & 1))) {
+            phb3_error(phb, "TCE access fault at 0x%"PRIx64, taddr);
+            phb3_error(phb, " xlate %"PRIx64":%c TVE=%"PRIx64, addr,
+                       is_write ? 'W' : 'R', tve);
+            phb3_error(phb, " tta=%"PRIx64" lev=%d tts=%d tps=%d",
+                       tta, lev, tts, tps);
+        }
+    }
+}
+
+static IOMMUTLBEntry pnv_phb3_translate_iommu(IOMMUMemoryRegion *iommu,
+                                              hwaddr addr,
+                                              IOMMUAccessFlags flag,
+                                              int iommu_idx)
+{
+    PnvPhb3DMASpace *ds = container_of(iommu, PnvPhb3DMASpace, dma_mr);
+    int tve_sel;
+    uint64_t tve, cfg;
+    IOMMUTLBEntry ret = {
+        .target_as = &address_space_memory,
+        .iova = addr,
+        .translated_addr = 0,
+        .addr_mask = ~(hwaddr)0,
+        .perm = IOMMU_NONE,
+    };
+    PnvPHB3 *phb = ds->phb;
+
+    /* Resolve PE# */
+    if (!pnv_phb3_resolve_pe(ds)) {
+        phb3_error(phb, "Failed to resolve PE# for bus @%p (%d) devfn 0x%x",
+                   ds->bus, pci_bus_num(ds->bus), ds->devfn);
+        return ret;
+    }
+
+    /* Check top bits */
+    switch (addr >> 60) {
+    case 00:
+        /* DMA or 32-bit MSI ? */
+        cfg = ds->phb->regs[PHB_PHB3_CONFIG >> 3];
+        if ((cfg & PHB_PHB3C_32BIT_MSI_EN) &&
+            ((addr & 0xffffffffffff0000ull) == 0xffff0000ull)) {
+            phb3_error(phb, "xlate on 32-bit MSI region");
+            return ret;
+        }
+        /* Choose TVE XXX Use PHB3 Control Register */
+        tve_sel = (addr >> 59) & 1;
+        tve = ds->phb->ioda_TVT[ds->pe_num * 2 + tve_sel];
+        pnv_phb3_translate_tve(ds, addr, flag & IOMMU_WO, tve, &ret);
+        break;
+    case 01:
+        phb3_error(phb, "xlate on 64-bit MSI region");
+        break;
+    default:
+        phb3_error(phb, "xlate on unsupported address 0x%"PRIx64, addr);
+    }
+    return ret;
+}
+
+#define TYPE_PNV_PHB3_IOMMU_MEMORY_REGION "pnv-phb3-iommu-memory-region"
+#define PNV_PHB3_IOMMU_MEMORY_REGION(obj) \
+    OBJECT_CHECK(IOMMUMemoryRegion, (obj), TYPE_PNV_PHB3_IOMMU_MEMORY_REGION)
+
+static void pnv_phb3_iommu_memory_region_class_init(ObjectClass *klass,
+                                                    void *data)
+{
+    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
+
+    imrc->translate = pnv_phb3_translate_iommu;
+}
+
+static const TypeInfo pnv_phb3_iommu_memory_region_info = {
+    .parent = TYPE_IOMMU_MEMORY_REGION,
+    .name = TYPE_PNV_PHB3_IOMMU_MEMORY_REGION,
+    .class_init = pnv_phb3_iommu_memory_region_class_init,
+};
+
+/*
+ * MSI/MSIX memory region implementation.
+ * The handler handles both MSI and MSIX.
+ */
+static void pnv_phb3_msi_write(void *opaque, hwaddr addr,
+                               uint64_t data, unsigned size)
+{
+    PnvPhb3DMASpace *ds = opaque;
+
+    /* Resolve PE# */
+    if (!pnv_phb3_resolve_pe(ds)) {
+        phb3_error(ds->phb, "Failed to resolve PE# for bus @%p (%d) devfn 0x%x",
+                   ds->bus, pci_bus_num(ds->bus), ds->devfn);
+        return;
+    }
+
+    pnv_phb3_msi_send(&ds->phb->msis, addr, data, ds->pe_num);
+}
+
+/*
+ * Reads of the MSI window are undefined by the PCI spec; log a guest
+ * error and return all ones.
+ */
+static uint64_t pnv_phb3_msi_read(void *opaque, hwaddr addr, unsigned size)
+{
+    PnvPhb3DMASpace *ds = opaque;
+
+    phb3_error(ds->phb, "invalid read @ 0x%" HWADDR_PRIx, addr);
+    return -1;
+}
+
+static const MemoryRegionOps pnv_phb3_msi_ops = {
+    .read = pnv_phb3_msi_read,
+    .write = pnv_phb3_msi_write,
+    .endianness = DEVICE_LITTLE_ENDIAN
+};
+
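+/*
+ * Return the DMA address space for a (bus, devfn) pair, creating it
+ * on first use along with its IOMMU region and MSI windows.
+ */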
+static AddressSpace *pnv_phb3_dma_iommu(PCIBus *bus, void *opaque, int devfn)
+{
+    PnvPHB3 *phb = opaque;
+    PnvPhb3DMASpace *ds;
+
+    QLIST_FOREACH(ds, &phb->dma_spaces, list) {
+        if (ds->bus == bus && ds->devfn == devfn) {
+            break;
+        }
+    }
+
+    if (ds == NULL) {
+        ds = g_malloc0(sizeof(PnvPhb3DMASpace));
+        ds->bus = bus;
+        ds->devfn = devfn;
+        ds->pe_num = PHB_INVALID_PE;
+        ds->phb = phb;
+        memory_region_init_iommu(&ds->dma_mr, sizeof(ds->dma_mr),
+                                 TYPE_PNV_PHB3_IOMMU_MEMORY_REGION,
+                                 OBJECT(phb), "phb3_iommu", UINT64_MAX);
+        address_space_init(&ds->dma_as, MEMORY_REGION(&ds->dma_mr),
+                           "phb3_iommu");
+        memory_region_init_io(&ds->msi32_mr, OBJECT(phb), &pnv_phb3_msi_ops,
+                              ds, "msi32", 0x10000);
+        memory_region_init_io(&ds->msi64_mr, OBJECT(phb), &pnv_phb3_msi_ops,
+                              ds, "msi64", 0x100000);
+        pnv_phb3_update_msi_regions(ds);
+
+        QLIST_INSERT_HEAD(&phb->dma_spaces, ds, list);
+    }
+    return &ds->dma_as;
+}
+
+static void pnv_phb3_instance_init(Object *obj)
+{
+    PnvPHB3 *phb = PNV_PHB3(obj);
+
+    QLIST_INIT(&phb->dma_spaces);
+
+    /* LSI sources */
+    object_initialize_child(obj, "lsi", &phb->lsis, sizeof(phb->lsis),
+                             TYPE_ICS, &error_abort, NULL);
+
+    /* Default init ... will be fixed by HW inits */
+    phb->lsis.offset = 0;
+
+    /* MSI sources */
+    object_initialize_child(obj, "msi", &phb->msis, sizeof(phb->msis),
+                            TYPE_PHB3_MSI, &error_abort, NULL);
+
+    /* Power Bus Common Queue */
+    object_initialize_child(obj, "pbcq", &phb->pbcq, sizeof(phb->pbcq),
+                            TYPE_PNV_PBCQ, &error_abort, NULL);
+
+    /* Root Port */
+    object_initialize_child(obj, "root", &phb->root, sizeof(phb->root),
+                            TYPE_PNV_PHB3_ROOT_PORT, &error_abort, NULL);
+    qdev_prop_set_int32(DEVICE(&phb->root), "addr", PCI_DEVFN(0, 0));
+    qdev_prop_set_bit(DEVICE(&phb->root), "multifunction", false);
+}
+
+static void pnv_phb3_realize(DeviceState *dev, Error **errp)
+{
+    PnvPHB3 *phb = PNV_PHB3(dev);
+    PCIHostState *pci = PCI_HOST_BRIDGE(dev);
+    PnvMachineState *pnv = PNV_MACHINE(qdev_get_machine());
+    Error *local_err = NULL;
+    int i;
+
+    if (phb->phb_id >= PNV8_CHIP_PHB3_MAX) {
+        error_setg(errp, "invalid PHB index: %d", phb->phb_id);
+        return;
+    }
+
+    /* LSI sources */
+    object_property_set_link(OBJECT(&phb->lsis), OBJECT(pnv), "xics",
+                             &error_abort);
+    object_property_set_int(OBJECT(&phb->lsis), PNV_PHB3_NUM_LSI, "nr-irqs",
+                            &error_abort);
+    object_property_set_bool(OBJECT(&phb->lsis), true, "realized", &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    for (i = 0; i < phb->lsis.nr_irqs; i++) {
+        ics_set_irq_type(&phb->lsis, i, true);
+    }
+
+    phb->qirqs = qemu_allocate_irqs(ics_set_irq, &phb->lsis, phb->lsis.nr_irqs);
+
+    /* MSI sources */
+    object_property_set_link(OBJECT(&phb->msis), OBJECT(phb), "phb",
+                             &error_abort);
+    object_property_set_link(OBJECT(&phb->msis), OBJECT(pnv), "xics",
+                             &error_abort);
+    object_property_set_int(OBJECT(&phb->msis), PHB3_MAX_MSI, "nr-irqs",
+                            &error_abort);
+    object_property_set_bool(OBJECT(&phb->msis), true, "realized", &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    /* Power Bus Common Queue */
+    object_property_set_link(OBJECT(&phb->pbcq), OBJECT(phb), "phb",
+                             &error_abort);
+    object_property_set_bool(OBJECT(&phb->pbcq), true, "realized", &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    /* Controller Registers */
+    memory_region_init_io(&phb->mr_regs, OBJECT(phb), &pnv_phb3_reg_ops, phb,
+                          "phb3-regs", 0x1000);
+
+    /*
+     * PHB3 doesn't support IO space. However, qemu gets very upset if
+     * we don't have an IO region to anchor IO BARs onto so we just
+     * initialize one which we never hook up to anything
+     */
+    memory_region_init(&phb->pci_io, OBJECT(phb), "pci-io", 0x10000);
+    memory_region_init(&phb->pci_mmio, OBJECT(phb), "pci-mmio",
+                       PCI_MMIO_TOTAL_SIZE);
+
+    pci->bus = pci_register_root_bus(dev, "root-bus",
+                                     pnv_phb3_set_irq, pnv_phb3_map_irq, phb,
+                                     &phb->pci_mmio, &phb->pci_io,
+                                     0, 4, TYPE_PNV_PHB3_ROOT_BUS);
+
+    pci_setup_iommu(pci->bus, pnv_phb3_dma_iommu, phb);
+
+    /* Add a single Root port */
+    qdev_prop_set_uint8(DEVICE(&phb->root), "chassis", phb->chip_id);
+    qdev_prop_set_uint16(DEVICE(&phb->root), "slot", phb->phb_id);
+    qdev_set_parent_bus(DEVICE(&phb->root), BUS(pci->bus));
+    qdev_init_nofail(DEVICE(&phb->root));
+}
+
+void pnv_phb3_update_regions(PnvPHB3 *phb)
+{
+    PnvPBCQState *pbcq = &phb->pbcq;
+
+    /* Unmap first always */
+    if (memory_region_is_mapped(&phb->mr_regs)) {
+        memory_region_del_subregion(&pbcq->phbbar, &phb->mr_regs);
+    }
+
+    /* Map registers if enabled */
+    if (memory_region_is_mapped(&pbcq->phbbar)) {
+        /* TODO: We should use the PHB BAR 2 register but we don't ... */
+        memory_region_add_subregion(&pbcq->phbbar, 0, &phb->mr_regs);
+    }
+
+    /* Check/update m32 */
+    if (memory_region_is_mapped(&phb->mr_m32)) {
+        pnv_phb3_check_m32(phb);
+    }
+    pnv_phb3_check_all_m64s(phb);
+}
+
+static const char *pnv_phb3_root_bus_path(PCIHostState *host_bridge,
+                                          PCIBus *rootbus)
+{
+    PnvPHB3 *phb = PNV_PHB3(host_bridge);
+
+    snprintf(phb->bus_path, sizeof(phb->bus_path), "00%02x:%02x",
+             phb->chip_id, phb->phb_id);
+    return phb->bus_path;
+}
+
+static Property pnv_phb3_properties[] = {
+        DEFINE_PROP_UINT32("index", PnvPHB3, phb_id, 0),
+        DEFINE_PROP_UINT32("chip-id", PnvPHB3, chip_id, 0),
+        DEFINE_PROP_END_OF_LIST(),
+};
+
+static void pnv_phb3_class_init(ObjectClass *klass, void *data)
+{
+    PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass);
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    hc->root_bus_path = pnv_phb3_root_bus_path;
+    dc->realize = pnv_phb3_realize;
+    device_class_set_props(dc, pnv_phb3_properties);
+    set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
+    dc->user_creatable = false;
+}
+
+static const TypeInfo pnv_phb3_type_info = {
+    .name          = TYPE_PNV_PHB3,
+    .parent        = TYPE_PCIE_HOST_BRIDGE,
+    .instance_size = sizeof(PnvPHB3),
+    .class_init    = pnv_phb3_class_init,
+    .instance_init = pnv_phb3_instance_init,
+};
+
+static void pnv_phb3_root_bus_class_init(ObjectClass *klass, void *data)
+{
+    BusClass *k = BUS_CLASS(klass);
+
+    /*
+     * PHB3 has only a single root complex. Enforce the limit on the
+     * parent bus
+     */
+    k->max_dev = 1;
+}
+
+static const TypeInfo pnv_phb3_root_bus_info = {
+    .name = TYPE_PNV_PHB3_ROOT_BUS,
+    .parent = TYPE_PCIE_BUS,
+    .class_init = pnv_phb3_root_bus_class_init,
+    .interfaces = (InterfaceInfo[]) {
+        { INTERFACE_PCIE_DEVICE },
+        { }
+    },
+};
+
+static void pnv_phb3_root_port_realize(DeviceState *dev, Error **errp)
+{
+    PCIERootPortClass *rpc = PCIE_ROOT_PORT_GET_CLASS(dev);
+    Error *local_err = NULL;
+
+    rpc->parent_realize(dev, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+}
+
+static void pnv_phb3_root_port_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+    PCIERootPortClass *rpc = PCIE_ROOT_PORT_CLASS(klass);
+
+    dc->desc     = "IBM PHB3 PCIE Root Port";
+
+    device_class_set_parent_realize(dc, pnv_phb3_root_port_realize,
+                                    &rpc->parent_realize);
+    dc->user_creatable = false;
+
+    k->vendor_id = PCI_VENDOR_ID_IBM;
+    k->device_id = 0x03dc;
+    k->revision  = 0;
+
+    rpc->exp_offset = 0x48;
+    rpc->aer_offset = 0x100;
+}
+
+static const TypeInfo pnv_phb3_root_port_info = {
+    .name          = TYPE_PNV_PHB3_ROOT_PORT,
+    .parent        = TYPE_PCIE_ROOT_PORT,
+    .instance_size = sizeof(PnvPHB3RootPort),
+    .class_init    = pnv_phb3_root_port_class_init,
+};
+
+static void pnv_phb3_register_types(void)
+{
+    type_register_static(&pnv_phb3_root_bus_info);
+    type_register_static(&pnv_phb3_root_port_info);
+    type_register_static(&pnv_phb3_type_info);
+    type_register_static(&pnv_phb3_iommu_memory_region_info);
+}
+
+type_init(pnv_phb3_register_types)
diff --git a/hw/pci-host/pnv_phb3_msi.c b/hw/pci-host/pnv_phb3_msi.c
new file mode 100644
index 0000000..ecfc1b2
--- /dev/null
+++ b/hw/pci-host/pnv_phb3_msi.c
@@ -0,0 +1,349 @@
+/*
+ * QEMU PowerPC PowerNV (POWER8) PHB3 model
+ *
+ * Copyright (c) 2014-2020, IBM Corporation.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "hw/pci-host/pnv_phb3_regs.h"
+#include "hw/pci-host/pnv_phb3.h"
+#include "hw/ppc/pnv.h"
+#include "hw/pci/msi.h"
+#include "monitor/monitor.h"
+#include "hw/irq.h"
+#include "hw/qdev-properties.h"
+#include "sysemu/reset.h"
+
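+/*
+ * Compute the guest address of the IVE for an MSI source (or 0 on
+ * error), using the 16 or 128 byte stride selected by the PHB
+ * control register.
+ */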
+static uint64_t phb3_msi_ive_addr(PnvPHB3 *phb, int srcno)
+{
+    uint64_t ivtbar = phb->regs[PHB_IVT_BAR >> 3];
+    uint64_t phbctl = phb->regs[PHB_CONTROL >> 3];
+
+    if (!(ivtbar & PHB_IVT_BAR_ENABLE)) {
+        qemu_log_mask(LOG_GUEST_ERROR, "Failed access to disabled IVT BAR !");
+        return 0;
+    }
+
+    if (srcno >= (ivtbar & PHB_IVT_LENGTH_MASK)) {
+        qemu_log_mask(LOG_GUEST_ERROR, "MSI out of bounds (%d vs 0x%"PRIx64")",
+                      srcno, (uint64_t) (ivtbar & PHB_IVT_LENGTH_MASK));
+        return 0;
+    }
+
+    ivtbar &= PHB_IVT_BASE_ADDRESS_MASK;
+
+    if (phbctl & PHB_CTRL_IVE_128_BYTES) {
+        return ivtbar + 128 * srcno;
+    } else {
+        return ivtbar + 16 * srcno;
+    }
+}
+
+static bool phb3_msi_read_ive(PnvPHB3 *phb, int srcno, uint64_t *out_ive)
+{
+    uint64_t ive_addr, ive;
+
+    ive_addr = phb3_msi_ive_addr(phb, srcno);
+    if (!ive_addr) {
+        return false;
+    }
+
+    if (dma_memory_read(&address_space_memory, ive_addr, &ive, sizeof(ive))) {
+        qemu_log_mask(LOG_GUEST_ERROR, "Failed to read IVE at 0x%" PRIx64,
+                      ive_addr);
+        return false;
+    }
+    *out_ive = be64_to_cpu(ive);
+
+    return true;
+}
+
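+/*
+ * The P and Q state bits are kept in bytes 4 and 5 of the in-memory
+ * IVE and are updated with single-byte DMA writes.
+ */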
+static void phb3_msi_set_p(Phb3MsiState *msi, int srcno, uint8_t gen)
+{
+    uint64_t ive_addr;
+    uint8_t p = 0x01 | (gen << 1);
+
+    ive_addr = phb3_msi_ive_addr(msi->phb, srcno);
+    if (!ive_addr) {
+        return;
+    }
+
+    if (dma_memory_write(&address_space_memory, ive_addr + 4, &p, 1)) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "Failed to write IVE (set P) at 0x%" PRIx64, ive_addr);
+    }
+}
+
+static void phb3_msi_set_q(Phb3MsiState *msi, int srcno)
+{
+    uint64_t ive_addr;
+    uint8_t q = 0x01;
+
+    ive_addr = phb3_msi_ive_addr(msi->phb, srcno);
+    if (!ive_addr) {
+        return;
+    }
+
+    if (dma_memory_write(&address_space_memory, ive_addr + 5, &q, 1)) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "Failed to write IVE (set Q) at 0x%" PRIx64, ive_addr);
+    }
+}
+
+static void phb3_msi_try_send(Phb3MsiState *msi, int srcno, bool force)
+{
+    ICSState *ics = ICS(msi);
+    uint64_t ive;
+    uint64_t server, prio, pq, gen;
+
+    if (!phb3_msi_read_ive(msi->phb, srcno, &ive)) {
+        return;
+    }
+
+    server = GETFIELD(IODA2_IVT_SERVER, ive);
+    prio = GETFIELD(IODA2_IVT_PRIORITY, ive);
+    if (!force) {
+        pq = GETFIELD(IODA2_IVT_Q, ive) | (GETFIELD(IODA2_IVT_P, ive) << 1);
+    } else {
+        pq = 0;
+    }
+    gen = GETFIELD(IODA2_IVT_GEN, ive);
+
+    /*
+     * The low order 2 bits are the link pointer (Type II interrupts).
+     * Shift back to get a valid IRQ server.
+     */
+    server >>= 2;
+
+    switch (pq) {
+    case 0: /* 00 */
+        if (prio == 0xff) {
+            /* Masked, set Q */
+            phb3_msi_set_q(msi, srcno);
+        } else {
+            /* Enabled, set P and send */
+            phb3_msi_set_p(msi, srcno, gen);
+            icp_irq(ics, server, srcno + ics->offset, prio);
+        }
+        break;
+    case 2: /* 10 */
+        /* Already pending, set Q */
+        phb3_msi_set_q(msi, srcno);
+        break;
+    case 1: /* 01 */
+    case 3: /* 11 */
+    default:
+        /* Just drop stuff if Q already set */
+        break;
+    }
+}
+
+static void phb3_msi_set_irq(void *opaque, int srcno, int val)
+{
+    Phb3MsiState *msi = PHB3_MSI(opaque);
+
+    if (val) {
+        phb3_msi_try_send(msi, srcno, false);
+    }
+}
+
+
+void pnv_phb3_msi_send(Phb3MsiState *msi, uint64_t addr, uint16_t data,
+                       int32_t dev_pe)
+{
+    ICSState *ics = ICS(msi);
+    uint64_t ive;
+    uint16_t pe;
+    uint32_t src = ((addr >> 4) & 0xffff) | (data & 0x1f);
+
+    if (src >= ics->nr_irqs) {
+        qemu_log_mask(LOG_GUEST_ERROR, "MSI %d out of bounds", src);
+        return;
+    }
+    if (dev_pe >= 0) {
+        if (!phb3_msi_read_ive(msi->phb, src, &ive)) {
+            return;
+        }
+        pe = GETFIELD(IODA2_IVT_PE, ive);
+        if (pe != dev_pe) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                          "MSI %d sent by PE#%d but assigned to PE#%d",
+                          src, dev_pe, pe);
+            return;
+        }
+    }
+    qemu_irq_pulse(msi->qirqs[src]);
+}
+
+void pnv_phb3_msi_ffi(Phb3MsiState *msi, uint64_t val)
+{
+    /* Emit interrupt */
+    pnv_phb3_msi_send(msi, val, 0, -1);
+
+    /* Clear FFI lock */
+    msi->phb->regs[PHB_FFI_LOCK >> 3] = 0;
+}
+
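+/*
+ * Rejected MSIs are recorded in the "rba" bitmap (64 sources per
+ * word) with a per-word summary in "rba_sum"; resend scans the
+ * summary and replays every pending source.
+ */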
+static void phb3_msi_reject(ICSState *ics, uint32_t nr)
+{
+    Phb3MsiState *msi = PHB3_MSI(ics);
+    unsigned int srcno = nr - ics->offset;
+    unsigned int idx = srcno >> 6;
+    uint64_t bit = 1ull << (srcno & 0x3f);
+
+    assert(srcno < PHB3_MAX_MSI);
+
+    msi->rba[idx] |= bit;
+    msi->rba_sum |= (1u << idx);
+}
+
+static void phb3_msi_resend(ICSState *ics)
+{
+    Phb3MsiState *msi = PHB3_MSI(ics);
+    unsigned int i, j;
+
+    if (msi->rba_sum == 0) {
+        return;
+    }
+
+    for (i = 0; i < 32; i++) {
+        if ((msi->rba_sum & (1u << i)) == 0) {
+            continue;
+        }
+        msi->rba_sum &= ~(1u << i);
+        for (j = 0; j < 64; j++) {
+            if ((msi->rba[i] & (1ull << j)) == 0) {
+                continue;
+            }
+            msi->rba[i] &= ~(1ull << j);
+            phb3_msi_try_send(msi, i * 64 + j, true);
+        }
+    }
+}
+
+static void phb3_msi_reset(DeviceState *dev)
+{
+    Phb3MsiState *msi = PHB3_MSI(dev);
+    ICSStateClass *icsc = ICS_GET_CLASS(dev);
+
+    icsc->parent_reset(dev);
+
+    memset(msi->rba, 0, sizeof(msi->rba));
+    msi->rba_sum = 0;
+}
+
+static void phb3_msi_reset_handler(void *dev)
+{
+    phb3_msi_reset(dev);
+}
+
+void pnv_phb3_msi_update_config(Phb3MsiState *msi, uint32_t base,
+                                uint32_t count)
+{
+    ICSState *ics = ICS(msi);
+
+    if (count > PHB3_MAX_MSI) {
+        count = PHB3_MAX_MSI;
+    }
+    ics->nr_irqs = count;
+    ics->offset = base;
+}
+
+static void phb3_msi_realize(DeviceState *dev, Error **errp)
+{
+    Phb3MsiState *msi = PHB3_MSI(dev);
+    ICSState *ics = ICS(msi);
+    ICSStateClass *icsc = ICS_GET_CLASS(ics);
+    Error *local_err = NULL;
+
+    assert(msi->phb);
+
+    icsc->parent_realize(dev, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    msi->qirqs = qemu_allocate_irqs(phb3_msi_set_irq, msi, ics->nr_irqs);
+
+    qemu_register_reset(phb3_msi_reset_handler, dev);
+}
+
+static void phb3_msi_instance_init(Object *obj)
+{
+    Phb3MsiState *msi = PHB3_MSI(obj);
+    ICSState *ics = ICS(obj);
+
+    object_property_add_link(obj, "phb", TYPE_PNV_PHB3,
+                             (Object **)&msi->phb,
+                             object_property_allow_set_link,
+                             OBJ_PROP_LINK_STRONG,
+                             &error_abort);
+
+    /* Will be overridden later */
+    ics->offset = 0;
+}
+
+static void phb3_msi_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    ICSStateClass *isc = ICS_CLASS(klass);
+
+    device_class_set_parent_realize(dc, phb3_msi_realize,
+                                    &isc->parent_realize);
+    device_class_set_parent_reset(dc, phb3_msi_reset,
+                                  &isc->parent_reset);
+
+    isc->reject = phb3_msi_reject;
+    isc->resend = phb3_msi_resend;
+}
+
+static const TypeInfo phb3_msi_info = {
+    .name = TYPE_PHB3_MSI,
+    .parent = TYPE_ICS,
+    .instance_size = sizeof(Phb3MsiState),
+    .class_init = phb3_msi_class_init,
+    .class_size = sizeof(ICSStateClass),
+    .instance_init = phb3_msi_instance_init,
+};
+
+static void pnv_phb3_msi_register_types(void)
+{
+    type_register_static(&phb3_msi_info);
+}
+
+type_init(pnv_phb3_msi_register_types)
+
+void pnv_phb3_msi_pic_print_info(Phb3MsiState *msi, Monitor *mon)
+{
+    ICSState *ics = ICS(msi);
+    int i;
+
+    monitor_printf(mon, "ICS %4x..%4x %p\n",
+                   ics->offset, ics->offset + ics->nr_irqs - 1, ics);
+
+    for (i = 0; i < ics->nr_irqs; i++) {
+        uint64_t ive;
+
+        if (!phb3_msi_read_ive(msi->phb, i, &ive)) {
+            return;
+        }
+
+        if (GETFIELD(IODA2_IVT_PRIORITY, ive) == 0xff) {
+            continue;
+        }
+
+        monitor_printf(mon, "  %4x %c%c server=%04x prio=%02x gen=%d\n",
+                       ics->offset + i,
+                       GETFIELD(IODA2_IVT_P, ive) ? 'P' : '-',
+                       GETFIELD(IODA2_IVT_Q, ive) ? 'Q' : '-',
+                       (uint32_t) GETFIELD(IODA2_IVT_SERVER, ive) >> 2,
+                       (uint32_t) GETFIELD(IODA2_IVT_PRIORITY, ive),
+                       (uint32_t) GETFIELD(IODA2_IVT_GEN, ive));
+    }
+}
diff --git a/hw/pci-host/pnv_phb3_pbcq.c b/hw/pci-host/pnv_phb3_pbcq.c
new file mode 100644
index 0000000..f232228
--- /dev/null
+++ b/hw/pci-host/pnv_phb3_pbcq.c
@@ -0,0 +1,358 @@
+/*
+ * QEMU PowerPC PowerNV (POWER8) PHB3 model
+ *
+ * Copyright (c) 2014-2020, IBM Corporation.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "qemu/log.h"
+#include "target/ppc/cpu.h"
+#include "hw/ppc/fdt.h"
+#include "hw/pci-host/pnv_phb3_regs.h"
+#include "hw/pci-host/pnv_phb3.h"
+#include "hw/ppc/pnv.h"
+#include "hw/ppc/pnv_xscom.h"
+#include "hw/pci/pci_bridge.h"
+#include "hw/pci/pci_bus.h"
+
+#include <libfdt.h>
+
+#define phb3_pbcq_error(pbcq, fmt, ...)                                 \
+    qemu_log_mask(LOG_GUEST_ERROR, "phb3_pbcq[%d:%d]: " fmt "\n",       \
+                  (pbcq)->phb->chip_id, (pbcq)->phb->phb_id, ## __VA_ARGS__)
+
+static uint64_t pnv_pbcq_nest_xscom_read(void *opaque, hwaddr addr,
+                                         unsigned size)
+{
+    PnvPBCQState *pbcq = PNV_PBCQ(opaque);
+    uint32_t offset = addr >> 3;
+
+    return pbcq->nest_regs[offset];
+}
+
+static uint64_t pnv_pbcq_pci_xscom_read(void *opaque, hwaddr addr,
+                                        unsigned size)
+{
+    PnvPBCQState *pbcq = PNV_PBCQ(opaque);
+    uint32_t offset = addr >> 3;
+
+    return pbcq->pci_regs[offset];
+}
+
+static uint64_t pnv_pbcq_spci_xscom_read(void *opaque, hwaddr addr,
+                                         unsigned size)
+{
+    PnvPBCQState *pbcq = PNV_PBCQ(opaque);
+    uint32_t offset = addr >> 3;
+
+    if (offset == PBCQ_SPCI_ASB_DATA) {
+        return pnv_phb3_reg_read(pbcq->phb,
+                                 pbcq->spci_regs[PBCQ_SPCI_ASB_ADDR], 8);
+    }
+    return pbcq->spci_regs[offset];
+}
+
+static void pnv_pbcq_update_map(PnvPBCQState *pbcq)
+{
+    uint64_t bar_en = pbcq->nest_regs[PBCQ_NEST_BAR_EN];
+    uint64_t bar, mask, size;
+
+    /*
+     * NOTE: This will really not work well if those are remapped
+     * after the PHB has created its sub regions. We could do better
+     * if we had a way to resize regions but we don't really care
+     * that much in practice as the stuff below really only happens
+     * once early during boot
+     */
+
+    /* Handle unmaps */
+    if (memory_region_is_mapped(&pbcq->mmbar0) &&
+        !(bar_en & PBCQ_NEST_BAR_EN_MMIO0)) {
+        memory_region_del_subregion(get_system_memory(), &pbcq->mmbar0);
+    }
+    if (memory_region_is_mapped(&pbcq->mmbar1) &&
+        !(bar_en & PBCQ_NEST_BAR_EN_MMIO1)) {
+        memory_region_del_subregion(get_system_memory(), &pbcq->mmbar1);
+    }
+    if (memory_region_is_mapped(&pbcq->phbbar) &&
+        !(bar_en & PBCQ_NEST_BAR_EN_PHB)) {
+        memory_region_del_subregion(get_system_memory(), &pbcq->phbbar);
+    }
+
+    /* Update PHB */
+    pnv_phb3_update_regions(pbcq->phb);
+
+    /* Handle maps */
+    if (!memory_region_is_mapped(&pbcq->mmbar0) &&
+        (bar_en & PBCQ_NEST_BAR_EN_MMIO0)) {
+        bar = pbcq->nest_regs[PBCQ_NEST_MMIO_BAR0] >> 14;
+        mask = pbcq->nest_regs[PBCQ_NEST_MMIO_MASK0];
+        size = ((~mask) >> 14) + 1;
+        memory_region_init(&pbcq->mmbar0, OBJECT(pbcq), "pbcq-mmio0", size);
+        memory_region_add_subregion(get_system_memory(), bar, &pbcq->mmbar0);
+        pbcq->mmio0_base = bar;
+        pbcq->mmio0_size = size;
+    }
+    if (!memory_region_is_mapped(&pbcq->mmbar1) &&
+        (bar_en & PBCQ_NEST_BAR_EN_MMIO1)) {
+        bar = pbcq->nest_regs[PBCQ_NEST_MMIO_BAR1] >> 14;
+        mask = pbcq->nest_regs[PBCQ_NEST_MMIO_MASK1];
+        size = ((~mask) >> 14) + 1;
+        memory_region_init(&pbcq->mmbar1, OBJECT(pbcq), "pbcq-mmio1", size);
+        memory_region_add_subregion(get_system_memory(), bar, &pbcq->mmbar1);
+        pbcq->mmio1_base = bar;
+        pbcq->mmio1_size = size;
+    }
+    if (!memory_region_is_mapped(&pbcq->phbbar)
+        && (bar_en & PBCQ_NEST_BAR_EN_PHB)) {
+        bar = pbcq->nest_regs[PBCQ_NEST_PHB_BAR] >> 14;
+        size = 0x1000;
+        memory_region_init(&pbcq->phbbar, OBJECT(pbcq), "pbcq-phb", size);
+        memory_region_add_subregion(get_system_memory(), bar, &pbcq->phbbar);
+    }
+
+    /* Update PHB */
+    pnv_phb3_update_regions(pbcq->phb);
+}
+
+static void pnv_pbcq_nest_xscom_write(void *opaque, hwaddr addr,
+                                uint64_t val, unsigned size)
+{
+    PnvPBCQState *pbcq = PNV_PBCQ(opaque);
+    uint32_t reg = addr >> 3;
+
+    switch (reg) {
+    case PBCQ_NEST_MMIO_BAR0:
+    case PBCQ_NEST_MMIO_BAR1:
+    case PBCQ_NEST_MMIO_MASK0:
+    case PBCQ_NEST_MMIO_MASK1:
+        if (pbcq->nest_regs[PBCQ_NEST_BAR_EN] &
+            (PBCQ_NEST_BAR_EN_MMIO0 |
+             PBCQ_NEST_BAR_EN_MMIO1)) {
+            phb3_pbcq_error(pbcq, "Changing enabled BAR unsupported");
+        }
+        pbcq->nest_regs[reg] = val & 0xffffffffc0000000ull;
+        break;
+    case PBCQ_NEST_PHB_BAR:
+        if (pbcq->nest_regs[PBCQ_NEST_BAR_EN] & PBCQ_NEST_BAR_EN_PHB) {
+            phb3_pbcq_error(pbcq, "Changing enabled BAR unsupported");
+        }
+        pbcq->nest_regs[reg] = val & 0xfffffffffc000000ull;
+        break;
+    case PBCQ_NEST_BAR_EN:
+        pbcq->nest_regs[reg] = val & 0xf800000000000000ull;
+        pnv_pbcq_update_map(pbcq);
+        pnv_phb3_remap_irqs(pbcq->phb);
+        break;
+    case PBCQ_NEST_IRSN_COMPARE:
+    case PBCQ_NEST_IRSN_MASK:
+        pbcq->nest_regs[reg] = val & PBCQ_NEST_IRSN_COMP;
+        pnv_phb3_remap_irqs(pbcq->phb);
+        break;
+    case PBCQ_NEST_LSI_SRC_ID:
+        pbcq->nest_regs[reg] = val & PBCQ_NEST_LSI_SRC;
+        pnv_phb3_remap_irqs(pbcq->phb);
+        break;
+    default:
+        phb3_pbcq_error(pbcq, "%s @0x%"HWADDR_PRIx"=%"PRIx64, __func__,
+                        addr, val);
+    }
+}
+
+static void pnv_pbcq_pci_xscom_write(void *opaque, hwaddr addr,
+                                     uint64_t val, unsigned size)
+{
+    PnvPBCQState *pbcq = PNV_PBCQ(opaque);
+    uint32_t reg = addr >> 3;
+
+    switch (reg) {
+    case PBCQ_PCI_BAR2:
+        pbcq->pci_regs[reg] = val & 0xfffffffffc000000ull;
+        pnv_pbcq_update_map(pbcq);
+        break;
+    default:
+        phb3_pbcq_error(pbcq, "%s @0x%"HWADDR_PRIx"=%"PRIx64, __func__,
+                        addr, val);
+    }
+}
+
+static void pnv_pbcq_spci_xscom_write(void *opaque, hwaddr addr,
+                                uint64_t val, unsigned size)
+{
+    PnvPBCQState *pbcq = PNV_PBCQ(opaque);
+    uint32_t reg = addr >> 3;
+
+    switch (reg) {
+    case PBCQ_SPCI_ASB_ADDR:
+        pbcq->spci_regs[reg] = val & 0xfff;
+        break;
+    case PBCQ_SPCI_ASB_STATUS:
+        pbcq->spci_regs[reg] &= ~val;
+        break;
+    case PBCQ_SPCI_ASB_DATA:
+        pnv_phb3_reg_write(pbcq->phb, pbcq->spci_regs[PBCQ_SPCI_ASB_ADDR],
+                           val, 8);
+        break;
+    case PBCQ_SPCI_AIB_CAPP_EN:
+    case PBCQ_SPCI_CAPP_SEC_TMR:
+        break;
+    default:
+        phb3_pbcq_error(pbcq, "%s @0x%"HWADDR_PRIx"=%"PRIx64, __func__,
+                        addr, val);
+    }
+}
+
+static const MemoryRegionOps pnv_pbcq_nest_xscom_ops = {
+    .read = pnv_pbcq_nest_xscom_read,
+    .write = pnv_pbcq_nest_xscom_write,
+    .valid.min_access_size = 8,
+    .valid.max_access_size = 8,
+    .impl.min_access_size = 8,
+    .impl.max_access_size = 8,
+    .endianness = DEVICE_BIG_ENDIAN,
+};
+
+static const MemoryRegionOps pnv_pbcq_pci_xscom_ops = {
+    .read = pnv_pbcq_pci_xscom_read,
+    .write = pnv_pbcq_pci_xscom_write,
+    .valid.min_access_size = 8,
+    .valid.max_access_size = 8,
+    .impl.min_access_size = 8,
+    .impl.max_access_size = 8,
+    .endianness = DEVICE_BIG_ENDIAN,
+};
+
+static const MemoryRegionOps pnv_pbcq_spci_xscom_ops = {
+    .read = pnv_pbcq_spci_xscom_read,
+    .write = pnv_pbcq_spci_xscom_write,
+    .valid.min_access_size = 8,
+    .valid.max_access_size = 8,
+    .impl.min_access_size = 8,
+    .impl.max_access_size = 8,
+    .endianness = DEVICE_BIG_ENDIAN,
+};
+
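+/*
+ * Establish default values for the two MMIO BARs and the PHB register
+ * BAR, spaced out per chip and per PHB. These would normally be
+ * programmed by firmware (see the TODO in pnv_pbcq_realize).
+ */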
+static void pnv_pbcq_default_bars(PnvPBCQState *pbcq)
+{
+    uint64_t mm0, mm1, reg;
+    PnvPHB3 *phb = pbcq->phb;
+
+    mm0 = 0x3d00000000000ull + 0x4000000000ull * phb->chip_id +
+            0x1000000000ull * phb->phb_id;
+    mm1 = 0x3ff8000000000ull + 0x0200000000ull * phb->chip_id +
+            0x0080000000ull * phb->phb_id;
+    reg = 0x3fffe40000000ull + 0x0000400000ull * phb->chip_id +
+            0x0000100000ull * phb->phb_id;
+
+    pbcq->nest_regs[PBCQ_NEST_MMIO_BAR0] = mm0 << 14;
+    pbcq->nest_regs[PBCQ_NEST_MMIO_BAR1] = mm1 << 14;
+    pbcq->nest_regs[PBCQ_NEST_PHB_BAR] = reg << 14;
+    pbcq->nest_regs[PBCQ_NEST_MMIO_MASK0] = 0x3fff000000000ull << 14;
+    pbcq->nest_regs[PBCQ_NEST_MMIO_MASK1] = 0x3ffff80000000ull << 14;
+    pbcq->pci_regs[PBCQ_PCI_BAR2] = reg << 14;
+}
+
+static void pnv_pbcq_realize(DeviceState *dev, Error **errp)
+{
+    PnvPBCQState *pbcq = PNV_PBCQ(dev);
+    PnvPHB3 *phb;
+    char name[32];
+
+    assert(pbcq->phb);
+    phb = pbcq->phb;
+
+    /* TODO: Fix OPAL to do that: establish default BAR values */
+    pnv_pbcq_default_bars(pbcq);
+
+    /* Initialize the XSCOM region for the PBCQ registers */
+    snprintf(name, sizeof(name), "xscom-pbcq-nest-%d.%d",
+             phb->chip_id, phb->phb_id);
+    pnv_xscom_region_init(&pbcq->xscom_nest_regs, OBJECT(dev),
+                          &pnv_pbcq_nest_xscom_ops, pbcq, name,
+                          PNV_XSCOM_PBCQ_NEST_SIZE);
+    snprintf(name, sizeof(name), "xscom-pbcq-pci-%d.%d",
+             phb->chip_id, phb->phb_id);
+    pnv_xscom_region_init(&pbcq->xscom_pci_regs, OBJECT(dev),
+                          &pnv_pbcq_pci_xscom_ops, pbcq, name,
+                          PNV_XSCOM_PBCQ_PCI_SIZE);
+    snprintf(name, sizeof(name), "xscom-pbcq-spci-%d.%d",
+             phb->chip_id, phb->phb_id);
+    pnv_xscom_region_init(&pbcq->xscom_spci_regs, OBJECT(dev),
+                          &pnv_pbcq_spci_xscom_ops, pbcq, name,
+                          PNV_XSCOM_PBCQ_SPCI_SIZE);
+}
+
+static int pnv_pbcq_dt_xscom(PnvXScomInterface *dev, void *fdt,
+                             int xscom_offset)
+{
+    const char compat[] = "ibm,power8-pbcq";
+    PnvPHB3 *phb = PNV_PBCQ(dev)->phb;
+    char *name;
+    int offset;
+    uint32_t lpc_pcba = PNV_XSCOM_PBCQ_NEST_BASE + 0x400 * phb->phb_id;
+    uint32_t reg[] = {
+        cpu_to_be32(lpc_pcba),
+        cpu_to_be32(PNV_XSCOM_PBCQ_NEST_SIZE),
+        cpu_to_be32(PNV_XSCOM_PBCQ_PCI_BASE + 0x400 * phb->phb_id),
+        cpu_to_be32(PNV_XSCOM_PBCQ_PCI_SIZE),
+        cpu_to_be32(PNV_XSCOM_PBCQ_SPCI_BASE + 0x040 * phb->phb_id),
+        cpu_to_be32(PNV_XSCOM_PBCQ_SPCI_SIZE)
+    };
+
+    name = g_strdup_printf("pbcq@%x", lpc_pcba);
+    offset = fdt_add_subnode(fdt, xscom_offset, name);
+    _FDT(offset);
+    g_free(name);
+
+    _FDT((fdt_setprop(fdt, offset, "reg", reg, sizeof(reg))));
+
+    _FDT((fdt_setprop_cell(fdt, offset, "ibm,phb-index", phb->phb_id)));
+    _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id", phb->chip_id)));
+    _FDT((fdt_setprop(fdt, offset, "compatible", compat,
+                      sizeof(compat))));
+    return 0;
+}
+
+static void phb3_pbcq_instance_init(Object *obj)
+{
+    PnvPBCQState *pbcq = PNV_PBCQ(obj);
+
+    object_property_add_link(obj, "phb", TYPE_PNV_PHB3,
+                             (Object **)&pbcq->phb,
+                             object_property_allow_set_link,
+                             OBJ_PROP_LINK_STRONG,
+                             &error_abort);
+}
+
+static void pnv_pbcq_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    PnvXScomInterfaceClass *xdc = PNV_XSCOM_INTERFACE_CLASS(klass);
+
+    xdc->dt_xscom = pnv_pbcq_dt_xscom;
+
+    dc->realize = pnv_pbcq_realize;
+    dc->user_creatable = false;
+}
+
+static const TypeInfo pnv_pbcq_type_info = {
+    .name          = TYPE_PNV_PBCQ,
+    .parent        = TYPE_DEVICE,
+    .instance_size = sizeof(PnvPBCQState),
+    .instance_init = phb3_pbcq_instance_init,
+    .class_init    = pnv_pbcq_class_init,
+    .interfaces    = (InterfaceInfo[]) {
+        { TYPE_PNV_XSCOM_INTERFACE },
+        { }
+    }
+};
+
+static void pnv_pbcq_register_types(void)
+{
+    type_register_static(&pnv_pbcq_type_info);
+}
+
+type_init(pnv_pbcq_register_types)
diff --git a/hw/pci-host/pnv_phb4.c b/hw/pci-host/pnv_phb4.c
new file mode 100644
index 0000000..23cf093
--- /dev/null
+++ b/hw/pci-host/pnv_phb4.c
@@ -0,0 +1,1439 @@
+/*
+ * QEMU PowerPC PowerNV (POWER9) PHB4 model
+ *
+ * Copyright (c) 2018-2020, IBM Corporation.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qapi/visitor.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "monitor/monitor.h"
+#include "target/ppc/cpu.h"
+#include "hw/pci-host/pnv_phb4_regs.h"
+#include "hw/pci-host/pnv_phb4.h"
+#include "hw/pci/pcie_host.h"
+#include "hw/pci/pcie_port.h"
+#include "hw/ppc/pnv.h"
+#include "hw/ppc/pnv_xscom.h"
+#include "hw/irq.h"
+#include "hw/qdev-properties.h"
+
+#define phb_error(phb, fmt, ...)                                        \
+    qemu_log_mask(LOG_GUEST_ERROR, "phb4[%d:%d]: " fmt "\n",            \
+                  (phb)->chip_id, (phb)->phb_id, ## __VA_ARGS__)
+
+/*
+ * QEMU version of the GETFIELD/SETFIELD macros
+ *
+ * These are common with the PnvXive model.
+ */
+static inline uint64_t GETFIELD(uint64_t mask, uint64_t word)
+{
+    return (word & mask) >> ctz64(mask);
+}
+
+static inline uint64_t SETFIELD(uint64_t mask, uint64_t word,
+                                uint64_t value)
+{
+    return (word & ~mask) | ((value << ctz64(mask)) & mask);
+}
+
+static PCIDevice *pnv_phb4_find_cfg_dev(PnvPHB4 *phb)
+{
+    PCIHostState *pci = PCI_HOST_BRIDGE(phb);
+    uint64_t addr = phb->regs[PHB_CONFIG_ADDRESS >> 3];
+    uint8_t bus, devfn;
+
+    if (!(addr >> 63)) {
+        return NULL;
+    }
+    bus = (addr >> 52) & 0xff;
+    devfn = (addr >> 44) & 0xff;
+
+    /* We don't access the root complex this way */
+    if (bus == 0 && devfn == 0) {
+        return NULL;
+    }
+    return pci_find_device(pci->bus, bus, devfn);
+}
+
+/*
+ * The CONFIG_DATA register expects little endian accesses, but as the
+ * region is big endian, we have to swap the value.
+ */
+static void pnv_phb4_config_write(PnvPHB4 *phb, unsigned off,
+                                  unsigned size, uint64_t val)
+{
+    uint32_t cfg_addr, limit;
+    PCIDevice *pdev;
+
+    pdev = pnv_phb4_find_cfg_dev(phb);
+    if (!pdev) {
+        return;
+    }
+    cfg_addr = (phb->regs[PHB_CONFIG_ADDRESS >> 3] >> 32) & 0xffc;
+    cfg_addr |= off;
+    limit = pci_config_size(pdev);
+    if (limit <= cfg_addr) {
+        /*
+         * A conventional PCI device can be behind a PCIe-to-PCI bridge.
+         * Accesses with 256 <= addr < 4K have no effect.
+         */
+        return;
+    }
+    switch (size) {
+    case 1:
+        break;
+    case 2:
+        val = bswap16(val);
+        break;
+    case 4:
+        val = bswap32(val);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    pci_host_config_write_common(pdev, cfg_addr, limit, val, size);
+}
+
+static uint64_t pnv_phb4_config_read(PnvPHB4 *phb, unsigned off,
+                                     unsigned size)
+{
+    uint32_t cfg_addr, limit;
+    PCIDevice *pdev;
+    uint64_t val;
+
+    pdev = pnv_phb4_find_cfg_dev(phb);
+    if (!pdev) {
+        return ~0ull;
+    }
+    cfg_addr = (phb->regs[PHB_CONFIG_ADDRESS >> 3] >> 32) & 0xffc;
+    cfg_addr |= off;
+    limit = pci_config_size(pdev);
+    if (limit <= cfg_addr) {
+        /*
+         * A conventional PCI device can be behind a PCIe-to-PCI bridge.
+         * Accesses with 256 <= addr < 4K have no effect.
+         */
+        return ~0ull;
+    }
+    val = pci_host_config_read_common(pdev, cfg_addr, limit, size);
+    switch (size) {
+    case 1:
+        return val;
+    case 2:
+        return bswap16(val);
+    case 4:
+        return bswap32(val);
+    default:
+        g_assert_not_reached();
+    }
+}
+
+/*
+ * Root complex register accesses are memory mapped.
+ */
+static void pnv_phb4_rc_config_write(PnvPHB4 *phb, unsigned off,
+                                     unsigned size, uint64_t val)
+{
+    PCIHostState *pci = PCI_HOST_BRIDGE(phb);
+    PCIDevice *pdev;
+
+    if (size != 4) {
+        phb_error(phb, "rc_config_write invalid size %d", size);
+        return;
+    }
+
+    pdev = pci_find_device(pci->bus, 0, 0);
+    assert(pdev);
+
+    pci_host_config_write_common(pdev, off, PHB_RC_CONFIG_SIZE,
+                                 bswap32(val), 4);
+}
+
+static uint64_t pnv_phb4_rc_config_read(PnvPHB4 *phb, unsigned off,
+                                        unsigned size)
+{
+    PCIHostState *pci = PCI_HOST_BRIDGE(phb);
+    PCIDevice *pdev;
+    uint64_t val;
+
+    if (size != 4) {
+        phb_error(phb, "rc_config_read invalid size %d", size);
+        return ~0ull;
+    }
+
+    pdev = pci_find_device(pci->bus, 0, 0);
+    assert(pdev);
+
+    val = pci_host_config_read_common(pdev, off, PHB_RC_CONFIG_SIZE, 4);
+    return bswap32(val);
+}
+
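+/*
+ * Re-map one MBT window: compute its geometry from the pair of IODA
+ * MBT entries and, when enabled, alias part of the PCI MMIO space
+ * into the matching MMIO region of the parent PEC stack.
+ */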
+static void pnv_phb4_check_mbt(PnvPHB4 *phb, uint32_t index)
+{
+    uint64_t base, start, size, mbe0, mbe1;
+    MemoryRegion *parent;
+    char name[64];
+
+    /* Unmap first */
+    if (memory_region_is_mapped(&phb->mr_mmio[index])) {
+        /* Should we destroy it in an RCU-friendly way... ? */
+        memory_region_del_subregion(phb->mr_mmio[index].container,
+                                    &phb->mr_mmio[index]);
+    }
+
+    /* Get table entry */
+    mbe0 = phb->ioda_MBT[(index << 1)];
+    mbe1 = phb->ioda_MBT[(index << 1) + 1];
+
+    if (!(mbe0 & IODA3_MBT0_ENABLE)) {
+        return;
+    }
+
+    /* Grab geometry from registers */
+    base = GETFIELD(IODA3_MBT0_BASE_ADDR, mbe0) << 12;
+    size = GETFIELD(IODA3_MBT1_MASK, mbe1) << 12;
+    size |= 0xff00000000000000ull;
+    size = ~size + 1;
+
+    /* Calculate PCI side start address based on M32/M64 window type */
+    if (mbe0 & IODA3_MBT0_TYPE_M32) {
+        start = phb->regs[PHB_M32_START_ADDR >> 3];
+        if ((start + size) > 0x100000000ull) {
+            phb_error(phb, "M32 set beyond 4GB boundary !");
+            size = 0x100000000 - start;
+        }
+    } else {
+        start = base | (phb->regs[PHB_M64_UPPER_BITS >> 3]);
+    }
+
+    /* TODO: Figure out how to implement/decode AOMASK */
+
+    /* Check if it matches an enabled MMIO region in the PEC stack */
+    if (memory_region_is_mapped(&phb->stack->mmbar0) &&
+        base >= phb->stack->mmio0_base &&
+        (base + size) <= (phb->stack->mmio0_base + phb->stack->mmio0_size)) {
+        parent = &phb->stack->mmbar0;
+        base -= phb->stack->mmio0_base;
+    } else if (memory_region_is_mapped(&phb->stack->mmbar1) &&
+        base >= phb->stack->mmio1_base &&
+        (base + size) <= (phb->stack->mmio1_base + phb->stack->mmio1_size)) {
+        parent = &phb->stack->mmbar1;
+        base -= phb->stack->mmio1_base;
+    } else {
+        phb_error(phb, "PHB MBAR %d out of parent bounds", index);
+        return;
+    }
+
+    /* Create alias (better name ?) */
+    snprintf(name, sizeof(name), "phb4-mbar%d", index);
+    memory_region_init_alias(&phb->mr_mmio[index], OBJECT(phb), name,
+                             &phb->pci_mmio, start, size);
+    memory_region_add_subregion(parent, base, &phb->mr_mmio[index]);
+}
+
+static void pnv_phb4_check_all_mbt(PnvPHB4 *phb)
+{
+    uint64_t i;
+    uint32_t num_windows = phb->big_phb ? PNV_PHB4_MAX_MMIO_WINDOWS :
+        PNV_PHB4_MIN_MMIO_WINDOWS;
+
+    for (i = 0; i < num_windows; i++) {
+        pnv_phb4_check_mbt(phb, i);
+    }
+}
+
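+/*
+ * IODA tables are accessed indirectly: the PHB_IODA_ADDR register
+ * selects the table and index (with optional auto-increment) for the
+ * subsequent data register accesses.
+ */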
+static uint64_t *pnv_phb4_ioda_access(PnvPHB4 *phb,
+                                      unsigned *out_table, unsigned *out_idx)
+{
+    uint64_t adreg = phb->regs[PHB_IODA_ADDR >> 3];
+    unsigned int index = GETFIELD(PHB_IODA_AD_TADR, adreg);
+    unsigned int table = GETFIELD(PHB_IODA_AD_TSEL, adreg);
+    unsigned int mask;
+    uint64_t *tptr = NULL;
+
+    switch (table) {
+    case IODA3_TBL_LIST:
+        tptr = phb->ioda_LIST;
+        mask = 7;
+        break;
+    case IODA3_TBL_MIST:
+        tptr = phb->ioda_MIST;
+        mask = phb->big_phb ? PNV_PHB4_MAX_MIST : (PNV_PHB4_MAX_MIST >> 1);
+        mask -= 1;
+        break;
+    case IODA3_TBL_RCAM:
+        mask = phb->big_phb ? 127 : 63;
+        break;
+    case IODA3_TBL_MRT:
+        mask = phb->big_phb ? 15 : 7;
+        break;
+    case IODA3_TBL_PESTA:
+    case IODA3_TBL_PESTB:
+        mask = phb->big_phb ? PNV_PHB4_MAX_PEs : (PNV_PHB4_MAX_PEs >> 1);
+        mask -= 1;
+        break;
+    case IODA3_TBL_TVT:
+        tptr = phb->ioda_TVT;
+        mask = phb->big_phb ? PNV_PHB4_MAX_TVEs : (PNV_PHB4_MAX_TVEs >> 1);
+        mask -= 1;
+        break;
+    case IODA3_TBL_TCR:
+    case IODA3_TBL_TDR:
+        mask = phb->big_phb ? 1023 : 511;
+        break;
+    case IODA3_TBL_MBT:
+        tptr = phb->ioda_MBT;
+        mask = phb->big_phb ? PNV_PHB4_MAX_MBEs : (PNV_PHB4_MAX_MBEs >> 1);
+        mask -= 1;
+        break;
+    case IODA3_TBL_MDT:
+        tptr = phb->ioda_MDT;
+        mask = phb->big_phb ? PNV_PHB4_MAX_PEs : (PNV_PHB4_MAX_PEs >> 1);
+        mask -= 1;
+        break;
+    case IODA3_TBL_PEEV:
+        tptr = phb->ioda_PEEV;
+        mask = phb->big_phb ? PNV_PHB4_MAX_PEEVs : (PNV_PHB4_MAX_PEEVs >> 1);
+        mask -= 1;
+        break;
+    default:
+        phb_error(phb, "invalid IODA table %d", table);
+        return NULL;
+    }
+    index &= mask;
+    if (out_idx) {
+        *out_idx = index;
+    }
+    if (out_table) {
+        *out_table = table;
+    }
+    if (tptr) {
+        tptr += index;
+    }
+    if (adreg & PHB_IODA_AD_AUTOINC) {
+        index = (index + 1) & mask;
+        adreg = SETFIELD(PHB_IODA_AD_TADR, adreg, index);
+    }
+
+    phb->regs[PHB_IODA_ADDR >> 3] = adreg;
+    return tptr;
+}
+
+static uint64_t pnv_phb4_ioda_read(PnvPHB4 *phb)
+{
+    unsigned table, idx;
+    uint64_t *tptr;
+
+    tptr = pnv_phb4_ioda_access(phb, &table, &idx);
+    if (!tptr) {
+        /* Special PESTA case */
+        if (table == IODA3_TBL_PESTA) {
+            return ((uint64_t)(phb->ioda_PEST_AB[idx] & 1)) << 63;
+        } else if (table == IODA3_TBL_PESTB) {
+            return ((uint64_t)(phb->ioda_PEST_AB[idx] & 2)) << 62;
+        }
+        /* Return 0 on unsupported tables, not ff's */
+        return 0;
+    }
+    return *tptr;
+}
+
+static void pnv_phb4_ioda_write(PnvPHB4 *phb, uint64_t val)
+{
+    unsigned table, idx;
+    uint64_t *tptr;
+
+    tptr = pnv_phb4_ioda_access(phb, &table, &idx);
+    if (!tptr) {
+        /* Special PESTA case */
+        if (table == IODA3_TBL_PESTA) {
+            phb->ioda_PEST_AB[idx] &= ~1;
+            phb->ioda_PEST_AB[idx] |= (val >> 63) & 1;
+        } else if (table == IODA3_TBL_PESTB) {
+            phb->ioda_PEST_AB[idx] &= ~2;
+            phb->ioda_PEST_AB[idx] |= (val >> 62) & 2;
+        }
+        return;
+    }
+
+    /* Handle side effects */
+    switch (table) {
+    case IODA3_TBL_LIST:
+        break;
+    case IODA3_TBL_MIST: {
+        /* Special mask for MIST partial write */
+        uint64_t adreg = phb->regs[PHB_IODA_ADDR >> 3];
+        uint32_t mmask = GETFIELD(PHB_IODA_AD_MIST_PWV, adreg);
+        uint64_t v = *tptr;
+        if (mmask == 0) {
+            mmask = 0xf;
+        }
+        if (mmask & 8) {
+            v &= 0x0000ffffffffffffull;
+            v |= 0xcfff000000000000ull & val;
+        }
+        if (mmask & 4) {
+            v &= 0xffff0000ffffffffull;
+            v |= 0x0000cfff00000000ull & val;
+        }
+        if (mmask & 2) {
+            v &= 0xffffffff0000ffffull;
+            v |= 0x00000000cfff0000ull & val;
+        }
+        if (mmask & 1) {
+            v &= 0xffffffffffff0000ull;
+            v |= 0x000000000000cfffull & val;
+        }
+        *tptr = v;
+        break;
+    }
+    case IODA3_TBL_MBT:
+        *tptr = val;
+
+        /* Copy across the valid bit to the other half */
+        phb->ioda_MBT[idx ^ 1] &= 0x7fffffffffffffffull;
+        phb->ioda_MBT[idx ^ 1] |= 0x8000000000000000ull & val;
+
+        /* Update mappings */
+        pnv_phb4_check_mbt(phb, idx >> 1);
+        break;
+    default:
+        *tptr = val;
+    }
+}
+
+static void pnv_phb4_rtc_invalidate(PnvPHB4 *phb, uint64_t val)
+{
+    PnvPhb4DMASpace *ds;
+
+    /* Always invalidate all for now ... */
+    QLIST_FOREACH(ds, &phb->dma_spaces, list) {
+        ds->pe_num = PHB_INVALID_PE;
+    }
+}
+
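+/*
+ * Map or unmap the MSI windows of a DMA space according to the PHB4
+ * configuration register: the 32-bit MSI window lives at DMA address
+ * 0xffff0000 and the 64-bit MSI window at 1 << 60.
+ */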
+static void pnv_phb4_update_msi_regions(PnvPhb4DMASpace *ds)
+{
+    uint64_t cfg = ds->phb->regs[PHB_PHB4_CONFIG >> 3];
+
+    if (cfg & PHB_PHB4C_32BIT_MSI_EN) {
+        if (!memory_region_is_mapped(MEMORY_REGION(&ds->msi32_mr))) {
+            memory_region_add_subregion(MEMORY_REGION(&ds->dma_mr),
+                                        0xffff0000, &ds->msi32_mr);
+        }
+    } else {
+        if (memory_region_is_mapped(MEMORY_REGION(&ds->msi32_mr))) {
+            memory_region_del_subregion(MEMORY_REGION(&ds->dma_mr),
+                                        &ds->msi32_mr);
+        }
+    }
+
+    if (cfg & PHB_PHB4C_64BIT_MSI_EN) {
+        if (!memory_region_is_mapped(MEMORY_REGION(&ds->msi64_mr))) {
+            memory_region_add_subregion(MEMORY_REGION(&ds->dma_mr),
+                                        (1ull << 60), &ds->msi64_mr);
+        }
+    } else {
+        if (memory_region_is_mapped(MEMORY_REGION(&ds->msi64_mr))) {
+            memory_region_del_subregion(MEMORY_REGION(&ds->dma_mr),
+                                        &ds->msi64_mr);
+        }
+    }
+}
+
+static void pnv_phb4_update_all_msi_regions(PnvPHB4 *phb)
+{
+    PnvPhb4DMASpace *ds;
+
+    QLIST_FOREACH(ds, &phb->dma_spaces, list) {
+        pnv_phb4_update_msi_regions(ds);
+    }
+}
+
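+/*
+ * Refresh the XIVE source characteristics from the PHB registers:
+ * the ESB page size (4K or 64K) and Store EOI support come from
+ * PHB_CTRLR, and the block of 8 LSIs starts at PHB_LSI_SRC_ID << 3,
+ * everything else being treated as MSI.
+ */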
+static void pnv_phb4_update_xsrc(PnvPHB4 *phb)
+{
+    int shift, flags, i, lsi_base;
+    XiveSource *xsrc = &phb->xsrc;
+
+    /* The XIVE source characteristics can be set at run time */
+    if (phb->regs[PHB_CTRLR >> 3] & PHB_CTRLR_IRQ_PGSZ_64K) {
+        shift = XIVE_ESB_64K;
+    } else {
+        shift = XIVE_ESB_4K;
+    }
+    if (phb->regs[PHB_CTRLR >> 3] & PHB_CTRLR_IRQ_STORE_EOI) {
+        flags = XIVE_SRC_STORE_EOI;
+    } else {
+        flags = 0;
+    }
+
+    phb->xsrc.esb_shift = shift;
+    phb->xsrc.esb_flags = flags;
+
+    lsi_base = GETFIELD(PHB_LSI_SRC_ID, phb->regs[PHB_LSI_SOURCE_ID >> 3]);
+    lsi_base <<= 3;
+
+    /* TODO: handle reset values of PHB_LSI_SRC_ID */
+    if (!lsi_base) {
+        return;
+    }
+
+    /* TODO: need a xive_source_irq_reset_lsi() */
+    bitmap_zero(xsrc->lsi_map, xsrc->nr_irqs);
+
+    for (i = 0; i < xsrc->nr_irqs; i++) {
+        bool msi = (i < lsi_base || i >= (lsi_base + 8));
+        if (!msi) {
+            xive_source_irq_set_lsi(xsrc, i);
+        }
+    }
+}
+
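+/*
+ * MMIO handlers for the PHB register space. Outbound configuration
+ * and RC config space accesses are special-cased, all other registers
+ * are 64-bit only, cached in phb->regs[] and have their side effects
+ * applied after the write.
+ */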
+static void pnv_phb4_reg_write(void *opaque, hwaddr off, uint64_t val,
+                               unsigned size)
+{
+    PnvPHB4 *phb = PNV_PHB4(opaque);
+    bool changed;
+
+    /* Special case outbound configuration data */
+    if ((off & 0xfffc) == PHB_CONFIG_DATA) {
+        pnv_phb4_config_write(phb, off & 0x3, size, val);
+        return;
+    }
+
+    /* Special case RC configuration space */
+    if ((off & 0xf800) == PHB_RC_CONFIG_BASE) {
+        pnv_phb4_rc_config_write(phb, off & 0x7ff, size, val);
+        return;
+    }
+
+    /* Other registers are 64-bit only */
+    if (size != 8 || off & 0x7) {
+        phb_error(phb, "Invalid register access, offset: 0x%"PRIx64" size: %d",
+                   off, size);
+        return;
+    }
+
+    /* Handle masking */
+    switch (off) {
+    case PHB_LSI_SOURCE_ID:
+        val &= PHB_LSI_SRC_ID;
+        break;
+    case PHB_M64_UPPER_BITS:
+        val &= 0xff00000000000000ull;
+        break;
+    /* TCE Kill */
+    case PHB_TCE_KILL:
+        /* Clear top 3 bits, as the HW does, to indicate successful queuing */
+        val &= ~(PHB_TCE_KILL_ALL | PHB_TCE_KILL_PE | PHB_TCE_KILL_ONE);
+        break;
+    case PHB_Q_DMA_R:
+        /*
+         * This is enough logic to make SW happy but we aren't
+         * actually quiescing the DMAs
+         */
+        if (val & PHB_Q_DMA_R_AUTORESET) {
+            val = 0;
+        } else {
+            val &= PHB_Q_DMA_R_QUIESCE_DMA;
+        }
+        break;
+    /* LEM stuff */
+    case PHB_LEM_FIR_AND_MASK:
+        phb->regs[PHB_LEM_FIR_ACCUM >> 3] &= val;
+        return;
+    case PHB_LEM_FIR_OR_MASK:
+        phb->regs[PHB_LEM_FIR_ACCUM >> 3] |= val;
+        return;
+    case PHB_LEM_ERROR_AND_MASK:
+        phb->regs[PHB_LEM_ERROR_MASK >> 3] &= val;
+        return;
+    case PHB_LEM_ERROR_OR_MASK:
+        phb->regs[PHB_LEM_ERROR_MASK >> 3] |= val;
+        return;
+    case PHB_LEM_WOF:
+        val = 0;
+        break;
+    /* TODO: More regs ..., maybe create a table with masks... */
+
+    /* Read only registers */
+    case PHB_CPU_LOADSTORE_STATUS:
+    case PHB_ETU_ERR_SUMMARY:
+    case PHB_PHB4_GEN_CAP:
+    case PHB_PHB4_TCE_CAP:
+    case PHB_PHB4_IRQ_CAP:
+    case PHB_PHB4_EEH_CAP:
+        return;
+    }
+
+    /* Record whether it changed */
+    changed = phb->regs[off >> 3] != val;
+
+    /* Store in register cache first */
+    phb->regs[off >> 3] = val;
+
+    /* Handle side effects */
+    switch (off) {
+    case PHB_PHB4_CONFIG:
+        if (changed) {
+            pnv_phb4_update_all_msi_regions(phb);
+        }
+        break;
+    case PHB_M32_START_ADDR:
+    case PHB_M64_UPPER_BITS:
+        if (changed) {
+            pnv_phb4_check_all_mbt(phb);
+        }
+        break;
+
+    /* IODA table accesses */
+    case PHB_IODA_DATA0:
+        pnv_phb4_ioda_write(phb, val);
+        break;
+
+    /* RTC invalidation */
+    case PHB_RTC_INVALIDATE:
+        pnv_phb4_rtc_invalidate(phb, val);
+        break;
+
+    /* PHB Control (Affects XIVE source) */
+    case PHB_CTRLR:
+    case PHB_LSI_SOURCE_ID:
+        pnv_phb4_update_xsrc(phb);
+        break;
+
+    /* Silent simple writes */
+    case PHB_ASN_CMPM:
+    case PHB_CONFIG_ADDRESS:
+    case PHB_IODA_ADDR:
+    case PHB_TCE_KILL:
+    case PHB_TCE_SPEC_CTL:
+    case PHB_PEST_BAR:
+    case PHB_PELTV_BAR:
+    case PHB_RTT_BAR:
+    case PHB_LEM_FIR_ACCUM:
+    case PHB_LEM_ERROR_MASK:
+    case PHB_LEM_ACTION0:
+    case PHB_LEM_ACTION1:
+    case PHB_TCE_TAG_ENABLE:
+    case PHB_INT_NOTIFY_ADDR:
+    case PHB_INT_NOTIFY_INDEX:
+    case PHB_DMARD_SYNC:
+        break;
+
+    /* Noise on anything else */
+    default:
+        qemu_log_mask(LOG_UNIMP, "phb4: reg_write 0x%"PRIx64"=%"PRIx64"\n",
+                      off, val);
+    }
+}
+
+static uint64_t pnv_phb4_reg_read(void *opaque, hwaddr off, unsigned size)
+{
+    PnvPHB4 *phb = PNV_PHB4(opaque);
+    uint64_t val;
+
+    if ((off & 0xfffc) == PHB_CONFIG_DATA) {
+        return pnv_phb4_config_read(phb, off & 0x3, size);
+    }
+
+    /* Special case RC configuration space */
+    if ((off & 0xf800) == PHB_RC_CONFIG_BASE) {
+        return pnv_phb4_rc_config_read(phb, off & 0x7ff, size);
+    }
+
+    /* Other registers are 64-bit only */
+    if (size != 8 || off & 0x7) {
+        phb_error(phb, "Invalid register access, offset: 0x%"PRIx64" size: %d",
+                   off, size);
+        return ~0ull;
+    }
+
+    /* Default read from cache */
+    val = phb->regs[off >> 3];
+
+    switch (off) {
+    case PHB_VERSION:
+        return phb->version;
+
+        /* Read-only */
+    case PHB_PHB4_GEN_CAP:
+        return 0xe4b8000000000000ull;
+    case PHB_PHB4_TCE_CAP:
+        return phb->big_phb ? 0x4008440000000400ull : 0x2008440000000200ull;
+    case PHB_PHB4_IRQ_CAP:
+        return phb->big_phb ? 0x0800000000001000ull : 0x0800000000000800ull;
+    case PHB_PHB4_EEH_CAP:
+        return phb->big_phb ? 0x2000000000000000ull : 0x1000000000000000ull;
+
+    /* IODA table accesses */
+    case PHB_IODA_DATA0:
+        return pnv_phb4_ioda_read(phb);
+
+    /* Link training always appears trained */
+    case PHB_PCIE_DLP_TRAIN_CTL:
+        /* TODO: Do something sensible with speed ? */
+        return PHB_PCIE_DLP_INBAND_PRESENCE | PHB_PCIE_DLP_TL_LINKACT;
+
+    /* DMA read sync: make it look like it's complete */
+    case PHB_DMARD_SYNC:
+        return PHB_DMARD_SYNC_COMPLETE;
+
+    /* Silent simple reads */
+    case PHB_LSI_SOURCE_ID:
+    case PHB_CPU_LOADSTORE_STATUS:
+    case PHB_ASN_CMPM:
+    case PHB_PHB4_CONFIG:
+    case PHB_M32_START_ADDR:
+    case PHB_CONFIG_ADDRESS:
+    case PHB_IODA_ADDR:
+    case PHB_RTC_INVALIDATE:
+    case PHB_TCE_KILL:
+    case PHB_TCE_SPEC_CTL:
+    case PHB_PEST_BAR:
+    case PHB_PELTV_BAR:
+    case PHB_RTT_BAR:
+    case PHB_M64_UPPER_BITS:
+    case PHB_CTRLR:
+    case PHB_LEM_FIR_ACCUM:
+    case PHB_LEM_ERROR_MASK:
+    case PHB_LEM_ACTION0:
+    case PHB_LEM_ACTION1:
+    case PHB_TCE_TAG_ENABLE:
+    case PHB_INT_NOTIFY_ADDR:
+    case PHB_INT_NOTIFY_INDEX:
+    case PHB_Q_DMA_R:
+    case PHB_ETU_ERR_SUMMARY:
+        break;
+
+    /* Noise on anything else */
+    default:
+        qemu_log_mask(LOG_UNIMP, "phb4: reg_read 0x%"PRIx64"=%"PRIx64"\n",
+                      off, val);
+    }
+    return val;
+}
+
+static const MemoryRegionOps pnv_phb4_reg_ops = {
+    .read = pnv_phb4_reg_read,
+    .write = pnv_phb4_reg_write,
+    .valid.min_access_size = 1,
+    .valid.max_access_size = 8,
+    .impl.min_access_size = 1,
+    .impl.max_access_size = 8,
+    .endianness = DEVICE_BIG_ENDIAN,
+};
+
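+/*
+ * XSCOM access to the PHB registers: most of them are reached
+ * indirectly by programming PHB_SCOM_HV_IND_ADDR (valid bit, 4 or 8
+ * byte access size, optional auto-increment) and then accessing
+ * PHB_SCOM_HV_IND_DATA. The LEM and performance monitor registers
+ * also have direct SCOM mirrors.
+ */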
+static uint64_t pnv_phb4_xscom_read(void *opaque, hwaddr addr, unsigned size)
+{
+    PnvPHB4 *phb = PNV_PHB4(opaque);
+    uint32_t reg = addr >> 3;
+    uint64_t val;
+    hwaddr offset;
+
+    switch (reg) {
+    case PHB_SCOM_HV_IND_ADDR:
+        return phb->scom_hv_ind_addr_reg;
+
+    case PHB_SCOM_HV_IND_DATA:
+        if (!(phb->scom_hv_ind_addr_reg & PHB_SCOM_HV_IND_ADDR_VALID)) {
+            phb_error(phb, "Invalid indirect address");
+            return ~0ull;
+        }
+        size = (phb->scom_hv_ind_addr_reg & PHB_SCOM_HV_IND_ADDR_4B) ? 4 : 8;
+        offset = GETFIELD(PHB_SCOM_HV_IND_ADDR_ADDR, phb->scom_hv_ind_addr_reg);
+        val = pnv_phb4_reg_read(phb, offset, size);
+        if (phb->scom_hv_ind_addr_reg & PHB_SCOM_HV_IND_ADDR_AUTOINC) {
+            offset += size;
+            offset &= 0x3fff;
+            phb->scom_hv_ind_addr_reg = SETFIELD(PHB_SCOM_HV_IND_ADDR_ADDR,
+                                                 phb->scom_hv_ind_addr_reg,
+                                                 offset);
+        }
+        return val;
+    case PHB_SCOM_ETU_LEM_FIR:
+    case PHB_SCOM_ETU_LEM_FIR_AND:
+    case PHB_SCOM_ETU_LEM_FIR_OR:
+    case PHB_SCOM_ETU_LEM_FIR_MSK:
+    case PHB_SCOM_ETU_LEM_ERR_MSK_AND:
+    case PHB_SCOM_ETU_LEM_ERR_MSK_OR:
+    case PHB_SCOM_ETU_LEM_ACT0:
+    case PHB_SCOM_ETU_LEM_ACT1:
+    case PHB_SCOM_ETU_LEM_WOF:
+        offset = ((reg - PHB_SCOM_ETU_LEM_FIR) << 3) + PHB_LEM_FIR_ACCUM;
+        return pnv_phb4_reg_read(phb, offset, size);
+    case PHB_SCOM_ETU_PMON_CONFIG:
+    case PHB_SCOM_ETU_PMON_CTR0:
+    case PHB_SCOM_ETU_PMON_CTR1:
+    case PHB_SCOM_ETU_PMON_CTR2:
+    case PHB_SCOM_ETU_PMON_CTR3:
+        offset = ((reg - PHB_SCOM_ETU_PMON_CONFIG) << 3) + PHB_PERFMON_CONFIG;
+        return pnv_phb4_reg_read(phb, offset, size);
+
+    default:
+        qemu_log_mask(LOG_UNIMP, "phb4: xscom_read 0x%"HWADDR_PRIx"\n", addr);
+        return ~0ull;
+    }
+}
+
+static void pnv_phb4_xscom_write(void *opaque, hwaddr addr,
+                                 uint64_t val, unsigned size)
+{
+    PnvPHB4 *phb = PNV_PHB4(opaque);
+    uint32_t reg = addr >> 3;
+    hwaddr offset;
+
+    switch (reg) {
+    case PHB_SCOM_HV_IND_ADDR:
+        phb->scom_hv_ind_addr_reg = val & 0xe000000000001fff;
+        break;
+    case PHB_SCOM_HV_IND_DATA:
+        if (!(phb->scom_hv_ind_addr_reg & PHB_SCOM_HV_IND_ADDR_VALID)) {
+            phb_error(phb, "Invalid indirect address");
+            break;
+        }
+        size = (phb->scom_hv_ind_addr_reg & PHB_SCOM_HV_IND_ADDR_4B) ? 4 : 8;
+        offset = GETFIELD(PHB_SCOM_HV_IND_ADDR_ADDR, phb->scom_hv_ind_addr_reg);
+        pnv_phb4_reg_write(phb, offset, val, size);
+        if (phb->scom_hv_ind_addr_reg & PHB_SCOM_HV_IND_ADDR_AUTOINC) {
+            offset += size;
+            offset &= 0x3fff;
+            phb->scom_hv_ind_addr_reg = SETFIELD(PHB_SCOM_HV_IND_ADDR_ADDR,
+                                                 phb->scom_hv_ind_addr_reg,
+                                                 offset);
+        }
+        break;
+    case PHB_SCOM_ETU_LEM_FIR:
+    case PHB_SCOM_ETU_LEM_FIR_AND:
+    case PHB_SCOM_ETU_LEM_FIR_OR:
+    case PHB_SCOM_ETU_LEM_FIR_MSK:
+    case PHB_SCOM_ETU_LEM_ERR_MSK_AND:
+    case PHB_SCOM_ETU_LEM_ERR_MSK_OR:
+    case PHB_SCOM_ETU_LEM_ACT0:
+    case PHB_SCOM_ETU_LEM_ACT1:
+    case PHB_SCOM_ETU_LEM_WOF:
+        offset = ((reg - PHB_SCOM_ETU_LEM_FIR) << 3) + PHB_LEM_FIR_ACCUM;
+        pnv_phb4_reg_write(phb, offset, val, size);
+        break;
+    case PHB_SCOM_ETU_PMON_CONFIG:
+    case PHB_SCOM_ETU_PMON_CTR0:
+    case PHB_SCOM_ETU_PMON_CTR1:
+    case PHB_SCOM_ETU_PMON_CTR2:
+    case PHB_SCOM_ETU_PMON_CTR3:
+        offset = ((reg - PHB_SCOM_ETU_PMON_CONFIG) << 3) + PHB_PERFMON_CONFIG;
+        pnv_phb4_reg_write(phb, offset, val, size);
+        break;
+    default:
+        qemu_log_mask(LOG_UNIMP, "phb4: xscom_write 0x%"HWADDR_PRIx
+                      "=%"PRIx64"\n", addr, val);
+    }
+}
+
+const MemoryRegionOps pnv_phb4_xscom_ops = {
+    .read = pnv_phb4_xscom_read,
+    .write = pnv_phb4_xscom_write,
+    .valid.min_access_size = 8,
+    .valid.max_access_size = 8,
+    .impl.min_access_size = 8,
+    .impl.max_access_size = 8,
+    .endianness = DEVICE_BIG_ENDIAN,
+};
+
+static int pnv_phb4_map_irq(PCIDevice *pci_dev, int irq_num)
+{
+    /* Check that out properly ... */
+    return irq_num & 3;
+}
+
+static void pnv_phb4_set_irq(void *opaque, int irq_num, int level)
+{
+    PnvPHB4 *phb = PNV_PHB4(opaque);
+    uint32_t lsi_base;
+
+    /* LSI only ... */
+    if (irq_num > 3) {
+        phb_error(phb, "IRQ %x is not an LSI", irq_num);
+    }
+    lsi_base = GETFIELD(PHB_LSI_SRC_ID, phb->regs[PHB_LSI_SOURCE_ID >> 3]);
+    lsi_base <<= 3;
+    qemu_set_irq(phb->qirqs[lsi_base + irq_num], level);
+}
+
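+/*
+ * Resolve the PE# for a DMA space by looking up the Resource
+ * Translation Table: the RTE of a requester is a 16-bit entry located
+ * at RTT base + 2 * ((bus << 8) | devfn). The result is cached in the
+ * DMA space until the next RTC invalidation.
+ */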
+static bool pnv_phb4_resolve_pe(PnvPhb4DMASpace *ds)
+{
+    uint64_t rtt, addr;
+    uint16_t rte;
+    int bus_num;
+    int num_PEs;
+
+    /* Already resolved ? */
+    if (ds->pe_num != PHB_INVALID_PE) {
+        return true;
+    }
+
+    /* We need to lookup the RTT */
+    rtt = ds->phb->regs[PHB_RTT_BAR >> 3];
+    if (!(rtt & PHB_RTT_BAR_ENABLE)) {
+        phb_error(ds->phb, "DMA with RTT BAR disabled !");
+        /* Set error bits ? fence ? ... */
+        return false;
+    }
+
+    /* Read RTE */
+    bus_num = pci_bus_num(ds->bus);
+    addr = rtt & PHB_RTT_BASE_ADDRESS_MASK;
+    addr += 2 * ((bus_num << 8) | ds->devfn);
+    if (dma_memory_read(&address_space_memory, addr, &rte, sizeof(rte))) {
+        phb_error(ds->phb, "Failed to read RTT entry at 0x%"PRIx64, addr);
+        /* Set error bits ? fence ? ... */
+        return false;
+    }
+    rte = be16_to_cpu(rte);
+
+    /* Fail upon reading of invalid PE# */
+    num_PEs = ds->phb->big_phb ? PNV_PHB4_MAX_PEs : (PNV_PHB4_MAX_PEs >> 1);
+    if (rte >= num_PEs) {
+        phb_error(ds->phb, "RTE for RID 0x%x invalid (%04x", ds->devfn, rte);
+        rte &= num_PEs - 1;
+    }
+    ds->pe_num = rte;
+    return true;
+}
+
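+/*
+ * Translate a DMA address through a TVE: an IO page size of 0 means
+ * the window is untranslated (4K pages for now), otherwise the TCE
+ * table is walked from the top level down, the final TCE providing
+ * the real address and the R/W permission bits.
+ */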
+static void pnv_phb4_translate_tve(PnvPhb4DMASpace *ds, hwaddr addr,
+                                   bool is_write, uint64_t tve,
+                                   IOMMUTLBEntry *tlb)
+{
+    uint64_t tta = GETFIELD(IODA3_TVT_TABLE_ADDR, tve);
+    int32_t  lev = GETFIELD(IODA3_TVT_NUM_LEVELS, tve);
+    uint32_t tts = GETFIELD(IODA3_TVT_TCE_TABLE_SIZE, tve);
+    uint32_t tps = GETFIELD(IODA3_TVT_IO_PSIZE, tve);
+
+    /* Invalid levels */
+    if (lev > 4) {
+        phb_error(ds->phb, "Invalid #levels in TVE %d", lev);
+        return;
+    }
+
+    /* Invalid entry */
+    if (tts == 0) {
+        phb_error(ds->phb, "Access to invalid TVE");
+        return;
+    }
+
+    /* IO Page Size of 0 means untranslated, else use TCEs */
+    if (tps == 0) {
+        /* TODO: Handle boundaries */
+
+        /* Use 4k pages like q35 ... for now */
+        tlb->iova = addr & 0xfffffffffffff000ull;
+        tlb->translated_addr = addr & 0x0003fffffffff000ull;
+        tlb->addr_mask = 0xfffull;
+        tlb->perm = IOMMU_RW;
+    } else {
+        uint32_t tce_shift, tbl_shift, sh;
+        uint64_t base, taddr, tce, tce_mask;
+
+        /* Address bits per bottom level TCE entry */
+        tce_shift = tps + 11;
+
+        /* Address bits per table level */
+        tbl_shift = tts + 8;
+
+        /* Top level table base address */
+        base = tta << 12;
+
+        /* Total shift to first level */
+        sh = tbl_shift * lev + tce_shift;
+
+        /* TODO: Limit to support IO page sizes */
+
+        /* TODO: Multi-level untested */
+        while ((lev--) >= 0) {
+            /* Grab the TCE address */
+            taddr = base | (((addr >> sh) & ((1ul << tbl_shift) - 1)) << 3);
+            if (dma_memory_read(&address_space_memory, taddr, &tce,
+                                sizeof(tce))) {
+                phb_error(ds->phb, "Failed to read TCE at 0x%"PRIx64, taddr);
+                return;
+            }
+            tce = be64_to_cpu(tce);
+
+            /* Check permission for indirect TCE */
+            if ((lev >= 0) && !(tce & 3)) {
+                phb_error(ds->phb, "Invalid indirect TCE at 0x%"PRIx64, taddr);
+                phb_error(ds->phb, " xlate %"PRIx64":%c TVE=%"PRIx64, addr,
+                           is_write ? 'W' : 'R', tve);
+                phb_error(ds->phb, " tta=%"PRIx64" lev=%d tts=%d tps=%d",
+                           tta, lev, tts, tps);
+                return;
+            }
+            sh -= tbl_shift;
+            base = tce & ~0xfffull;
+        }
+
+        /* We exit the loop with TCE being the final TCE */
+        tce_mask = ~((1ull << tce_shift) - 1);
+        tlb->iova = addr & tce_mask;
+        tlb->translated_addr = tce & tce_mask;
+        tlb->addr_mask = ~tce_mask;
+        tlb->perm = tce & 3;
+        if ((is_write & !(tce & 2)) || ((!is_write) && !(tce & 1))) {
+            phb_error(ds->phb, "TCE access fault at 0x%"PRIx64, taddr);
+            phb_error(ds->phb, " xlate %"PRIx64":%c TVE=%"PRIx64, addr,
+                       is_write ? 'W' : 'R', tve);
+            phb_error(ds->phb, " tta=%"PRIx64" lev=%d tts=%d tps=%d",
+                       tta, lev, tts, tps);
+        }
+    }
+}
+
+static IOMMUTLBEntry pnv_phb4_translate_iommu(IOMMUMemoryRegion *iommu,
+                                              hwaddr addr,
+                                              IOMMUAccessFlags flag,
+                                              int iommu_idx)
+{
+    PnvPhb4DMASpace *ds = container_of(iommu, PnvPhb4DMASpace, dma_mr);
+    int tve_sel;
+    uint64_t tve, cfg;
+    IOMMUTLBEntry ret = {
+        .target_as = &address_space_memory,
+        .iova = addr,
+        .translated_addr = 0,
+        .addr_mask = ~(hwaddr)0,
+        .perm = IOMMU_NONE,
+    };
+
+    /* Resolve PE# */
+    if (!pnv_phb4_resolve_pe(ds)) {
+        phb_error(ds->phb, "Failed to resolve PE# for bus @%p (%d) devfn 0x%x",
+                   ds->bus, pci_bus_num(ds->bus), ds->devfn);
+        return ret;
+    }
+
+    /* Check top bits */
+    switch (addr >> 60) {
+    case 00:
+        /* DMA or 32-bit MSI ? */
+        cfg = ds->phb->regs[PHB_PHB4_CONFIG >> 3];
+        if ((cfg & PHB_PHB4C_32BIT_MSI_EN) &&
+            ((addr & 0xffffffffffff0000ull) == 0xffff0000ull)) {
+            phb_error(ds->phb, "xlate on 32-bit MSI region");
+            return ret;
+        }
+        /* Choose TVE XXX Use PHB4 Control Register */
+        tve_sel = (addr >> 59) & 1;
+        tve = ds->phb->ioda_TVT[ds->pe_num * 2 + tve_sel];
+        pnv_phb4_translate_tve(ds, addr, flag & IOMMU_WO, tve, &ret);
+        break;
+    case 01:
+        phb_error(ds->phb, "xlate on 64-bit MSI region");
+        break;
+    default:
+        phb_error(ds->phb, "xlate on unsupported address 0x%"PRIx64, addr);
+    }
+    return ret;
+}
+
+#define TYPE_PNV_PHB4_IOMMU_MEMORY_REGION "pnv-phb4-iommu-memory-region"
+#define PNV_PHB4_IOMMU_MEMORY_REGION(obj) \
+    OBJECT_CHECK(IOMMUMemoryRegion, (obj), TYPE_PNV_PHB4_IOMMU_MEMORY_REGION)
+
+static void pnv_phb4_iommu_memory_region_class_init(ObjectClass *klass,
+                                                    void *data)
+{
+    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
+
+    imrc->translate = pnv_phb4_translate_iommu;
+}
+
+static const TypeInfo pnv_phb4_iommu_memory_region_info = {
+    .parent = TYPE_IOMMU_MEMORY_REGION,
+    .name = TYPE_PNV_PHB4_IOMMU_MEMORY_REGION,
+    .class_init = pnv_phb4_iommu_memory_region_class_init,
+};
+
+/*
+ * MSI/MSIX memory region implementation.
+ * The handler handles both MSI and MSIX.
+ */
+static void pnv_phb4_msi_write(void *opaque, hwaddr addr,
+                               uint64_t data, unsigned size)
+{
+    PnvPhb4DMASpace *ds = opaque;
+    PnvPHB4 *phb = ds->phb;
+
+    uint32_t src = ((addr >> 4) & 0xffff) | (data & 0x1f);
+
+    /* Resolve PE# */
+    if (!pnv_phb4_resolve_pe(ds)) {
+        phb_error(phb, "Failed to resolve PE# for bus @%p (%d) devfn 0x%x",
+                   ds->bus, pci_bus_num(ds->bus), ds->devfn);
+        return;
+    }
+
+    /* TODO: Check it doesn't collide with LSIs */
+    if (src >= phb->xsrc.nr_irqs) {
+        phb_error(phb, "MSI %d out of bounds", src);
+        return;
+    }
+
+    /* TODO: check PE/MSI assignment */
+
+    qemu_irq_pulse(phb->qirqs[src]);
+}
+
+/* There is no .read as the read result is undefined by PCI spec */
+static uint64_t pnv_phb4_msi_read(void *opaque, hwaddr addr, unsigned size)
+{
+    PnvPhb4DMASpace *ds = opaque;
+
+    phb_error(ds->phb, "Invalid MSI read @ 0x%" HWADDR_PRIx, addr);
+    return -1;
+}
+
+static const MemoryRegionOps pnv_phb4_msi_ops = {
+    .read = pnv_phb4_msi_read,
+    .write = pnv_phb4_msi_write,
+    .endianness = DEVICE_LITTLE_ENDIAN
+};
+
+static PnvPhb4DMASpace *pnv_phb4_dma_find(PnvPHB4 *phb, PCIBus *bus, int devfn)
+{
+    PnvPhb4DMASpace *ds;
+
+    QLIST_FOREACH(ds, &phb->dma_spaces, list) {
+        if (ds->bus == bus && ds->devfn == devfn) {
+            break;
+        }
+    }
+    return ds;
+}
+
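+/*
+ * DMA address spaces are created lazily, one per (bus, devfn) pair,
+ * the first time the address space of a device is requested, and are
+ * kept on the PHB's list of DMA spaces.
+ */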
+static AddressSpace *pnv_phb4_dma_iommu(PCIBus *bus, void *opaque, int devfn)
+{
+    PnvPHB4 *phb = opaque;
+    PnvPhb4DMASpace *ds;
+    char name[32];
+
+    ds = pnv_phb4_dma_find(phb, bus, devfn);
+
+    if (ds == NULL) {
+        ds = g_malloc0(sizeof(PnvPhb4DMASpace));
+        ds->bus = bus;
+        ds->devfn = devfn;
+        ds->pe_num = PHB_INVALID_PE;
+        ds->phb = phb;
+        snprintf(name, sizeof(name), "phb4-%d.%d-iommu", phb->chip_id,
+                 phb->phb_id);
+        memory_region_init_iommu(&ds->dma_mr, sizeof(ds->dma_mr),
+                                 TYPE_PNV_PHB4_IOMMU_MEMORY_REGION,
+                                 OBJECT(phb), name, UINT64_MAX);
+        address_space_init(&ds->dma_as, MEMORY_REGION(&ds->dma_mr),
+                           name);
+        memory_region_init_io(&ds->msi32_mr, OBJECT(phb), &pnv_phb4_msi_ops,
+                              ds, "msi32", 0x10000);
+        memory_region_init_io(&ds->msi64_mr, OBJECT(phb), &pnv_phb4_msi_ops,
+                              ds, "msi64", 0x100000);
+        pnv_phb4_update_msi_regions(ds);
+
+        QLIST_INSERT_HEAD(&phb->dma_spaces, ds, list);
+    }
+    return &ds->dma_as;
+}
+
+static void pnv_phb4_instance_init(Object *obj)
+{
+    PnvPHB4 *phb = PNV_PHB4(obj);
+
+    QLIST_INIT(&phb->dma_spaces);
+
+    /* XIVE interrupt source object */
+    object_initialize_child(obj, "source", &phb->xsrc, sizeof(XiveSource),
+                            TYPE_XIVE_SOURCE, &error_abort, NULL);
+
+    /* Root Port */
+    object_initialize_child(obj, "root", &phb->root, sizeof(phb->root),
+                            TYPE_PNV_PHB4_ROOT_PORT, &error_abort, NULL);
+
+    qdev_prop_set_int32(DEVICE(&phb->root), "addr", PCI_DEVFN(0, 0));
+    qdev_prop_set_bit(DEVICE(&phb->root), "multifunction", false);
+}
+
+static void pnv_phb4_realize(DeviceState *dev, Error **errp)
+{
+    PnvPHB4 *phb = PNV_PHB4(dev);
+    PCIHostState *pci = PCI_HOST_BRIDGE(dev);
+    XiveSource *xsrc = &phb->xsrc;
+    Error *local_err = NULL;
+    int nr_irqs;
+    char name[32];
+
+    assert(phb->stack);
+
+    /* Set the "big_phb" flag */
+    phb->big_phb = phb->phb_id == 0 || phb->phb_id == 3;
+
+    /* Controller Registers */
+    snprintf(name, sizeof(name), "phb4-%d.%d-regs", phb->chip_id,
+             phb->phb_id);
+    memory_region_init_io(&phb->mr_regs, OBJECT(phb), &pnv_phb4_reg_ops, phb,
+                          name, 0x2000);
+
+    /*
+     * PHB4 doesn't support IO space. However, QEMU gets very upset if
+     * we don't have an IO region to anchor IO BARs onto, so we just
+     * initialize one which we never hook up to anything.
+     */
+
+    snprintf(name, sizeof(name), "phb4-%d.%d-pci-io", phb->chip_id,
+             phb->phb_id);
+    memory_region_init(&phb->pci_io, OBJECT(phb), name, 0x10000);
+
+    snprintf(name, sizeof(name), "phb4-%d.%d-pci-mmio", phb->chip_id,
+             phb->phb_id);
+    memory_region_init(&phb->pci_mmio, OBJECT(phb), name,
+                       PCI_MMIO_TOTAL_SIZE);
+
+    pci->bus = pci_register_root_bus(dev, "root-bus",
+                                     pnv_phb4_set_irq, pnv_phb4_map_irq, phb,
+                                     &phb->pci_mmio, &phb->pci_io,
+                                     0, 4, TYPE_PNV_PHB4_ROOT_BUS);
+    pci_setup_iommu(pci->bus, pnv_phb4_dma_iommu, phb);
+
+    /* Add a single Root port */
+    qdev_prop_set_uint8(DEVICE(&phb->root), "chassis", phb->chip_id);
+    qdev_prop_set_uint16(DEVICE(&phb->root), "slot", phb->phb_id);
+    qdev_set_parent_bus(DEVICE(&phb->root), BUS(pci->bus));
+    qdev_init_nofail(DEVICE(&phb->root));
+
+    /* Setup XIVE Source */
+    if (phb->big_phb) {
+        nr_irqs = PNV_PHB4_MAX_INTs;
+    } else {
+        nr_irqs = PNV_PHB4_MAX_INTs >> 1;
+    }
+    object_property_set_int(OBJECT(xsrc), nr_irqs, "nr-irqs", &error_fatal);
+    object_property_set_link(OBJECT(xsrc), OBJECT(phb), "xive", &error_fatal);
+    object_property_set_bool(OBJECT(xsrc), true, "realized", &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    pnv_phb4_update_xsrc(phb);
+
+    phb->qirqs = qemu_allocate_irqs(xive_source_set_irq, xsrc, xsrc->nr_irqs);
+}
+
+static void pnv_phb4_reset(DeviceState *dev)
+{
+    PnvPHB4 *phb = PNV_PHB4(dev);
+    PCIDevice *root_dev = PCI_DEVICE(&phb->root);
+
+    /*
+     * Configure PCI device id at reset using a property.
+     */
+    pci_config_set_vendor_id(root_dev->config, PCI_VENDOR_ID_IBM);
+    pci_config_set_device_id(root_dev->config, phb->device_id);
+}
+
+static const char *pnv_phb4_root_bus_path(PCIHostState *host_bridge,
+                                          PCIBus *rootbus)
+{
+    PnvPHB4 *phb = PNV_PHB4(host_bridge);
+
+    snprintf(phb->bus_path, sizeof(phb->bus_path), "00%02x:%02x",
+             phb->chip_id, phb->phb_id);
+    return phb->bus_path;
+}
+
+static void pnv_phb4_xive_notify(XiveNotifier *xf, uint32_t srcno)
+{
+    PnvPHB4 *phb = PNV_PHB4(xf);
+    uint64_t notif_port = phb->regs[PHB_INT_NOTIFY_ADDR >> 3];
+    uint32_t offset = phb->regs[PHB_INT_NOTIFY_INDEX >> 3];
+    uint64_t data = XIVE_TRIGGER_PQ | offset | srcno;
+    MemTxResult result;
+
+    address_space_stq_be(&address_space_memory, notif_port, data,
+                         MEMTXATTRS_UNSPECIFIED, &result);
+    if (result != MEMTX_OK) {
+        phb_error(phb, "trigger failed @%"HWADDR_PRIx "\n", notif_port);
+        return;
+    }
+}
+
+static Property pnv_phb4_properties[] = {
+        DEFINE_PROP_UINT32("index", PnvPHB4, phb_id, 0),
+        DEFINE_PROP_UINT32("chip-id", PnvPHB4, chip_id, 0),
+        DEFINE_PROP_UINT64("version", PnvPHB4, version, 0),
+        DEFINE_PROP_UINT16("device-id", PnvPHB4, device_id, 0),
+        DEFINE_PROP_LINK("stack", PnvPHB4, stack, TYPE_PNV_PHB4_PEC_STACK,
+                         PnvPhb4PecStack *),
+        DEFINE_PROP_END_OF_LIST(),
+};
+
+static void pnv_phb4_class_init(ObjectClass *klass, void *data)
+{
+    PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass);
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    XiveNotifierClass *xfc = XIVE_NOTIFIER_CLASS(klass);
+
+    hc->root_bus_path   = pnv_phb4_root_bus_path;
+    dc->realize         = pnv_phb4_realize;
+    device_class_set_props(dc, pnv_phb4_properties);
+    set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
+    dc->user_creatable  = false;
+    dc->reset           = pnv_phb4_reset;
+
+    xfc->notify         = pnv_phb4_xive_notify;
+}
+
+static const TypeInfo pnv_phb4_type_info = {
+    .name          = TYPE_PNV_PHB4,
+    .parent        = TYPE_PCIE_HOST_BRIDGE,
+    .instance_init = pnv_phb4_instance_init,
+    .instance_size = sizeof(PnvPHB4),
+    .class_init    = pnv_phb4_class_init,
+    .interfaces = (InterfaceInfo[]) {
+            { TYPE_XIVE_NOTIFIER },
+            { },
+    }
+};
+
+static void pnv_phb4_root_bus_class_init(ObjectClass *klass, void *data)
+{
+    BusClass *k = BUS_CLASS(klass);
+
+    /*
+     * PHB4 has only a single root complex. Enforce the limit on the
+     * parent bus
+     */
+    k->max_dev = 1;
+}
+
+static const TypeInfo pnv_phb4_root_bus_info = {
+    .name = TYPE_PNV_PHB4_ROOT_BUS,
+    .parent = TYPE_PCIE_BUS,
+    .class_init = pnv_phb4_root_bus_class_init,
+    .interfaces = (InterfaceInfo[]) {
+        { INTERFACE_PCIE_DEVICE },
+        { }
+    },
+};
+
+static void pnv_phb4_root_port_reset(DeviceState *dev)
+{
+    PCIERootPortClass *rpc = PCIE_ROOT_PORT_GET_CLASS(dev);
+    PCIDevice *d = PCI_DEVICE(dev);
+    uint8_t *conf = d->config;
+
+    rpc->parent_reset(dev);
+
+    pci_byte_test_and_set_mask(conf + PCI_IO_BASE,
+                               PCI_IO_RANGE_MASK & 0xff);
+    pci_byte_test_and_clear_mask(conf + PCI_IO_LIMIT,
+                                 PCI_IO_RANGE_MASK & 0xff);
+    pci_set_word(conf + PCI_MEMORY_BASE, 0);
+    pci_set_word(conf + PCI_MEMORY_LIMIT, 0xfff0);
+    pci_set_word(conf + PCI_PREF_MEMORY_BASE, 0x1);
+    pci_set_word(conf + PCI_PREF_MEMORY_LIMIT, 0xfff1);
+    pci_set_long(conf + PCI_PREF_BASE_UPPER32, 0x1); /* Hack */
+    pci_set_long(conf + PCI_PREF_LIMIT_UPPER32, 0xffffffff);
+}
+
+static void pnv_phb4_root_port_realize(DeviceState *dev, Error **errp)
+{
+    PCIERootPortClass *rpc = PCIE_ROOT_PORT_GET_CLASS(dev);
+    Error *local_err = NULL;
+
+    rpc->parent_realize(dev, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+}
+
+static void pnv_phb4_root_port_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+    PCIERootPortClass *rpc = PCIE_ROOT_PORT_CLASS(klass);
+
+    dc->desc     = "IBM PHB4 PCIE Root Port";
+    dc->user_creatable = false;
+
+    device_class_set_parent_realize(dc, pnv_phb4_root_port_realize,
+                                    &rpc->parent_realize);
+    device_class_set_parent_reset(dc, pnv_phb4_root_port_reset,
+                                  &rpc->parent_reset);
+
+    k->vendor_id = PCI_VENDOR_ID_IBM;
+    k->device_id = PNV_PHB4_DEVICE_ID;
+    k->revision  = 0;
+
+    rpc->exp_offset = 0x48;
+    rpc->aer_offset = 0x100;
+
+    dc->reset = &pnv_phb4_root_port_reset;
+}
+
+static const TypeInfo pnv_phb4_root_port_info = {
+    .name          = TYPE_PNV_PHB4_ROOT_PORT,
+    .parent        = TYPE_PCIE_ROOT_PORT,
+    .instance_size = sizeof(PnvPHB4RootPort),
+    .class_init    = pnv_phb4_root_port_class_init,
+};
+
+static void pnv_phb4_register_types(void)
+{
+    type_register_static(&pnv_phb4_root_bus_info);
+    type_register_static(&pnv_phb4_root_port_info);
+    type_register_static(&pnv_phb4_type_info);
+    type_register_static(&pnv_phb4_iommu_memory_region_info);
+}
+
+type_init(pnv_phb4_register_types);
+
+void pnv_phb4_update_regions(PnvPhb4PecStack *stack)
+{
+    PnvPHB4 *phb = &stack->phb;
+
+    /* Unmap first always */
+    if (memory_region_is_mapped(&phb->mr_regs)) {
+        memory_region_del_subregion(&stack->phbbar, &phb->mr_regs);
+    }
+    if (memory_region_is_mapped(&phb->xsrc.esb_mmio)) {
+        memory_region_del_subregion(&stack->intbar, &phb->xsrc.esb_mmio);
+    }
+
+    /* Map registers if enabled */
+    if (memory_region_is_mapped(&stack->phbbar)) {
+        memory_region_add_subregion(&stack->phbbar, 0, &phb->mr_regs);
+    }
+
+    /* Map ESB if enabled */
+    if (memory_region_is_mapped(&stack->intbar)) {
+        memory_region_add_subregion(&stack->intbar, 0, &phb->xsrc.esb_mmio);
+    }
+
+    /* Check/update m32 */
+    pnv_phb4_check_all_mbt(phb);
+}
+
+void pnv_phb4_pic_print_info(PnvPHB4 *phb, Monitor *mon)
+{
+    uint32_t offset = phb->regs[PHB_INT_NOTIFY_INDEX >> 3];
+
+    monitor_printf(mon, "PHB4[%x:%x] Source %08x .. %08x\n",
+                   phb->chip_id, phb->phb_id,
+                   offset, offset + phb->xsrc.nr_irqs - 1);
+    xive_source_pic_print_info(&phb->xsrc, 0, mon);
+}
diff --git a/hw/pci-host/pnv_phb4_pec.c b/hw/pci-host/pnv_phb4_pec.c
new file mode 100644
index 0000000..68e1db3
--- /dev/null
+++ b/hw/pci-host/pnv_phb4_pec.c
@@ -0,0 +1,595 @@
+/*
+ * QEMU PowerPC PowerNV (POWER9) PHB4 model
+ *
+ * Copyright (c) 2018-2020, IBM Corporation.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "qemu/log.h"
+#include "target/ppc/cpu.h"
+#include "hw/ppc/fdt.h"
+#include "hw/pci-host/pnv_phb4_regs.h"
+#include "hw/pci-host/pnv_phb4.h"
+#include "hw/ppc/pnv_xscom.h"
+#include "hw/pci/pci_bridge.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/ppc/pnv.h"
+#include "hw/qdev-properties.h"
+
+#include <libfdt.h>
+
+#define phb_pec_error(pec, fmt, ...)                                    \
+    qemu_log_mask(LOG_GUEST_ERROR, "phb4_pec[%d:%d]: " fmt "\n",        \
+                  (pec)->chip_id, (pec)->index, ## __VA_ARGS__)
+
+
+static uint64_t pnv_pec_nest_xscom_read(void *opaque, hwaddr addr,
+                                        unsigned size)
+{
+    PnvPhb4PecState *pec = PNV_PHB4_PEC(opaque);
+    uint32_t reg = addr >> 3;
+
+    /* TODO: add list of allowed registers and error out if not */
+    return pec->nest_regs[reg];
+}
+
+static void pnv_pec_nest_xscom_write(void *opaque, hwaddr addr,
+                                     uint64_t val, unsigned size)
+{
+    PnvPhb4PecState *pec = PNV_PHB4_PEC(opaque);
+    uint32_t reg = addr >> 3;
+
+    switch (reg) {
+    case PEC_NEST_PBCQ_HW_CONFIG:
+    case PEC_NEST_DROP_PRIO_CTRL:
+    case PEC_NEST_PBCQ_ERR_INJECT:
+    case PEC_NEST_PCI_NEST_CLK_TRACE_CTL:
+    case PEC_NEST_PBCQ_PMON_CTRL:
+    case PEC_NEST_PBCQ_PBUS_ADDR_EXT:
+    case PEC_NEST_PBCQ_PRED_VEC_TIMEOUT:
+    case PEC_NEST_CAPP_CTRL:
+    case PEC_NEST_PBCQ_READ_STK_OVR:
+    case PEC_NEST_PBCQ_WRITE_STK_OVR:
+    case PEC_NEST_PBCQ_STORE_STK_OVR:
+    case PEC_NEST_PBCQ_RETRY_BKOFF_CTRL:
+        pec->nest_regs[reg] = val;
+        break;
+    default:
+        phb_pec_error(pec, "%s @0x%"HWADDR_PRIx"=%"PRIx64"\n", __func__,
+                      addr, val);
+    }
+}
+
+static const MemoryRegionOps pnv_pec_nest_xscom_ops = {
+    .read = pnv_pec_nest_xscom_read,
+    .write = pnv_pec_nest_xscom_write,
+    .valid.min_access_size = 8,
+    .valid.max_access_size = 8,
+    .impl.min_access_size = 8,
+    .impl.max_access_size = 8,
+    .endianness = DEVICE_BIG_ENDIAN,
+};
+
+static uint64_t pnv_pec_pci_xscom_read(void *opaque, hwaddr addr,
+                                       unsigned size)
+{
+    PnvPhb4PecState *pec = PNV_PHB4_PEC(opaque);
+    uint32_t reg = addr >> 3;
+
+    /* TODO: add list of allowed registers and error out if not */
+    return pec->pci_regs[reg];
+}
+
+static void pnv_pec_pci_xscom_write(void *opaque, hwaddr addr,
+                                    uint64_t val, unsigned size)
+{
+    PnvPhb4PecState *pec = PNV_PHB4_PEC(opaque);
+    uint32_t reg = addr >> 3;
+
+    switch (reg) {
+    case PEC_PCI_PBAIB_HW_CONFIG:
+    case PEC_PCI_PBAIB_READ_STK_OVR:
+        pec->pci_regs[reg] = val;
+        break;
+    default:
+        phb_pec_error(pec, "%s @0x%"HWADDR_PRIx"=%"PRIx64"\n", __func__,
+                      addr, val);
+    }
+}
+
+static const MemoryRegionOps pnv_pec_pci_xscom_ops = {
+    .read = pnv_pec_pci_xscom_read,
+    .write = pnv_pec_pci_xscom_write,
+    .valid.min_access_size = 8,
+    .valid.max_access_size = 8,
+    .impl.min_access_size = 8,
+    .impl.max_access_size = 8,
+    .endianness = DEVICE_BIG_ENDIAN,
+};
+
+static uint64_t pnv_pec_stk_nest_xscom_read(void *opaque, hwaddr addr,
+                                            unsigned size)
+{
+    PnvPhb4PecStack *stack = PNV_PHB4_PEC_STACK(opaque);
+    uint32_t reg = addr >> 3;
+
+    /* TODO: add list of allowed registers and error out if not */
+    return stack->nest_regs[reg];
+}
+
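+/*
+ * Recompute the stack memory map from the BAR enable register: the
+ * two MMIO windows, the PHB register space and the interrupt (ESB)
+ * space are mapped into system memory when their enable bit is set
+ * and unmapped when it is cleared.
+ */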
+static void pnv_pec_stk_update_map(PnvPhb4PecStack *stack)
+{
+    PnvPhb4PecState *pec = stack->pec;
+    MemoryRegion *sysmem = pec->system_memory;
+    uint64_t bar_en = stack->nest_regs[PEC_NEST_STK_BAR_EN];
+    uint64_t bar, mask, size;
+    char name[64];
+
+    /*
+     * NOTE: This will not work well if the BARs are remapped after
+     * the PHB has created its sub regions. We could do better if we
+     * had a way to resize regions, but in practice the mappings below
+     * are only set up once, early during boot.
+     */
+
+    /* Handle unmaps */
+    if (memory_region_is_mapped(&stack->mmbar0) &&
+        !(bar_en & PEC_NEST_STK_BAR_EN_MMIO0)) {
+        memory_region_del_subregion(sysmem, &stack->mmbar0);
+    }
+    if (memory_region_is_mapped(&stack->mmbar1) &&
+        !(bar_en & PEC_NEST_STK_BAR_EN_MMIO1)) {
+        memory_region_del_subregion(sysmem, &stack->mmbar1);
+    }
+    if (memory_region_is_mapped(&stack->phbbar) &&
+        !(bar_en & PEC_NEST_STK_BAR_EN_PHB)) {
+        memory_region_del_subregion(sysmem, &stack->phbbar);
+    }
+    if (memory_region_is_mapped(&stack->intbar) &&
+        !(bar_en & PEC_NEST_STK_BAR_EN_INT)) {
+        memory_region_del_subregion(sysmem, &stack->intbar);
+    }
+
+    /* Update PHB */
+    pnv_phb4_update_regions(stack);
+
+    /* Handle maps */
+    if (!memory_region_is_mapped(&stack->mmbar0) &&
+        (bar_en & PEC_NEST_STK_BAR_EN_MMIO0)) {
+        bar = stack->nest_regs[PEC_NEST_STK_MMIO_BAR0] >> 8;
+        mask = stack->nest_regs[PEC_NEST_STK_MMIO_BAR0_MASK];
+        size = ((~mask) >> 8) + 1;
+        snprintf(name, sizeof(name), "pec-%d.%d-stack-%d-mmio0",
+                 pec->chip_id, pec->index, stack->stack_no);
+        memory_region_init(&stack->mmbar0, OBJECT(stack), name, size);
+        memory_region_add_subregion(sysmem, bar, &stack->mmbar0);
+        stack->mmio0_base = bar;
+        stack->mmio0_size = size;
+    }
+    if (!memory_region_is_mapped(&stack->mmbar1) &&
+        (bar_en & PEC_NEST_STK_BAR_EN_MMIO1)) {
+        bar = stack->nest_regs[PEC_NEST_STK_MMIO_BAR1] >> 8;
+        mask = stack->nest_regs[PEC_NEST_STK_MMIO_BAR1_MASK];
+        size = ((~mask) >> 8) + 1;
+        snprintf(name, sizeof(name), "pec-%d.%d-stack-%d-mmio1",
+                 pec->chip_id, pec->index, stack->stack_no);
+        memory_region_init(&stack->mmbar1, OBJECT(stack), name, size);
+        memory_region_add_subregion(sysmem, bar, &stack->mmbar1);
+        stack->mmio1_base = bar;
+        stack->mmio1_size = size;
+    }
+    if (!memory_region_is_mapped(&stack->phbbar) &&
+        (bar_en & PEC_NEST_STK_BAR_EN_PHB)) {
+        bar = stack->nest_regs[PEC_NEST_STK_PHB_REGS_BAR] >> 8;
+        size = PNV_PHB4_NUM_REGS << 3;
+        snprintf(name, sizeof(name), "pec-%d.%d-stack-%d-phb",
+                 pec->chip_id, pec->index, stack->stack_no);
+        memory_region_init(&stack->phbbar, OBJECT(stack), name, size);
+        memory_region_add_subregion(sysmem, bar, &stack->phbbar);
+    }
+    if (!memory_region_is_mapped(&stack->intbar) &&
+        (bar_en & PEC_NEST_STK_BAR_EN_INT)) {
+        bar = stack->nest_regs[PEC_NEST_STK_INT_BAR] >> 8;
+        size = PNV_PHB4_MAX_INTs << 16;
+        snprintf(name, sizeof(name), "pec-%d.%d-stack-%d-int",
+                 stack->pec->chip_id, stack->pec->index, stack->stack_no);
+        memory_region_init(&stack->intbar, OBJECT(stack), name, size);
+        memory_region_add_subregion(sysmem, bar, &stack->intbar);
+    }
+
+    /* Update PHB */
+    pnv_phb4_update_regions(stack);
+}
+
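+/*
+ * Nest XSCOM writes: the PCI nest FIR has separate set (OR), clear
+ * (AND) and mask variants, and changing a BAR while its enable bit is
+ * set is reported as unsupported.
+ */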
+static void pnv_pec_stk_nest_xscom_write(void *opaque, hwaddr addr,
+                                         uint64_t val, unsigned size)
+{
+    PnvPhb4PecStack *stack = PNV_PHB4_PEC_STACK(opaque);
+    PnvPhb4PecState *pec = stack->pec;
+    uint32_t reg = addr >> 3;
+
+    switch (reg) {
+    case PEC_NEST_STK_PCI_NEST_FIR:
+        stack->nest_regs[PEC_NEST_STK_PCI_NEST_FIR] = val;
+        break;
+    case PEC_NEST_STK_PCI_NEST_FIR_CLR:
+        stack->nest_regs[PEC_NEST_STK_PCI_NEST_FIR] &= val;
+        break;
+    case PEC_NEST_STK_PCI_NEST_FIR_SET:
+        stack->nest_regs[PEC_NEST_STK_PCI_NEST_FIR] |= val;
+        break;
+    case PEC_NEST_STK_PCI_NEST_FIR_MSK:
+        stack->nest_regs[PEC_NEST_STK_PCI_NEST_FIR_MSK] = val;
+        break;
+    case PEC_NEST_STK_PCI_NEST_FIR_MSKC:
+        stack->nest_regs[PEC_NEST_STK_PCI_NEST_FIR_MSK] &= val;
+        break;
+    case PEC_NEST_STK_PCI_NEST_FIR_MSKS:
+        stack->nest_regs[PEC_NEST_STK_PCI_NEST_FIR_MSK] |= val;
+        break;
+    case PEC_NEST_STK_PCI_NEST_FIR_ACT0:
+    case PEC_NEST_STK_PCI_NEST_FIR_ACT1:
+        stack->nest_regs[reg] = val;
+        break;
+    case PEC_NEST_STK_PCI_NEST_FIR_WOF:
+        stack->nest_regs[reg] = 0;
+        break;
+    case PEC_NEST_STK_ERR_REPORT_0:
+    case PEC_NEST_STK_ERR_REPORT_1:
+    case PEC_NEST_STK_PBCQ_GNRL_STATUS:
+        /* Flag error ? */
+        break;
+    case PEC_NEST_STK_PBCQ_MODE:
+        stack->nest_regs[reg] = val & 0xff00000000000000ull;
+        break;
+    case PEC_NEST_STK_MMIO_BAR0:
+    case PEC_NEST_STK_MMIO_BAR0_MASK:
+    case PEC_NEST_STK_MMIO_BAR1:
+    case PEC_NEST_STK_MMIO_BAR1_MASK:
+        if (stack->nest_regs[PEC_NEST_STK_BAR_EN] &
+            (PEC_NEST_STK_BAR_EN_MMIO0 |
+             PEC_NEST_STK_BAR_EN_MMIO1)) {
+            phb_pec_error(pec, "Changing enabled BAR unsupported\n");
+        }
+        stack->nest_regs[reg] = val & 0xffffffffff000000ull;
+        break;
+    case PEC_NEST_STK_PHB_REGS_BAR:
+        if (stack->nest_regs[PEC_NEST_STK_BAR_EN] & PEC_NEST_STK_BAR_EN_PHB) {
+            phb_pec_error(pec, "Changing enabled BAR unsupported\n");
+        }
+        stack->nest_regs[reg] = val & 0xffffffffffc00000ull;
+        break;
+    case PEC_NEST_STK_INT_BAR:
+        if (stack->nest_regs[PEC_NEST_STK_BAR_EN] & PEC_NEST_STK_BAR_EN_INT) {
+            phb_pec_error(pec, "Changing enabled BAR unsupported\n");
+        }
+        stack->nest_regs[reg] = val & 0xfffffff000000000ull;
+        break;
+    case PEC_NEST_STK_BAR_EN:
+        stack->nest_regs[reg] = val & 0xf000000000000000ull;
+        pnv_pec_stk_update_map(stack);
+        break;
+    case PEC_NEST_STK_DATA_FRZ_TYPE:
+    case PEC_NEST_STK_PBCQ_TUN_BAR:
+        /* Not used for now */
+        stack->nest_regs[reg] = val;
+        break;
+    default:
+        qemu_log_mask(LOG_UNIMP, "phb4_pec: nest_xscom_write 0x%"HWADDR_PRIx
+                      "=%"PRIx64"\n", addr, val);
+    }
+}
+
+static const MemoryRegionOps pnv_pec_stk_nest_xscom_ops = {
+    .read = pnv_pec_stk_nest_xscom_read,
+    .write = pnv_pec_stk_nest_xscom_write,
+    .valid.min_access_size = 8,
+    .valid.max_access_size = 8,
+    .impl.min_access_size = 8,
+    .impl.max_access_size = 8,
+    .endianness = DEVICE_BIG_ENDIAN,
+};
+
+static uint64_t pnv_pec_stk_pci_xscom_read(void *opaque, hwaddr addr,
+                                           unsigned size)
+{
+    PnvPhb4PecStack *stack = PNV_PHB4_PEC_STACK(opaque);
+    uint32_t reg = addr >> 3;
+
+    /* TODO: add list of allowed registers and error out if not */
+    return stack->pci_regs[reg];
+}
+
+static void pnv_pec_stk_pci_xscom_write(void *opaque, hwaddr addr,
+                                        uint64_t val, unsigned size)
+{
+    PnvPhb4PecStack *stack = PNV_PHB4_PEC_STACK(opaque);
+    uint32_t reg = addr >> 3;
+
+    switch (reg) {
+    case PEC_PCI_STK_PCI_FIR:
+        stack->pci_regs[reg] = val;
+        break;
+    case PEC_PCI_STK_PCI_FIR_CLR:
+        stack->pci_regs[PEC_PCI_STK_PCI_FIR] &= val;
+        break;
+    case PEC_PCI_STK_PCI_FIR_SET:
+        stack->pci_regs[PEC_PCI_STK_PCI_FIR] |= val;
+        break;
+    case PEC_PCI_STK_PCI_FIR_MSK:
+        stack->pci_regs[reg] = val;
+        break;
+    case PEC_PCI_STK_PCI_FIR_MSKC:
+        stack->pci_regs[PEC_PCI_STK_PCI_FIR_MSK] &= val;
+        break;
+    case PEC_PCI_STK_PCI_FIR_MSKS:
+        stack->pci_regs[PEC_PCI_STK_PCI_FIR_MSK] |= val;
+        break;
+    case PEC_PCI_STK_PCI_FIR_ACT0:
+    case PEC_PCI_STK_PCI_FIR_ACT1:
+        stack->pci_regs[reg] = val;
+        break;
+    case PEC_PCI_STK_PCI_FIR_WOF:
+        stack->pci_regs[reg] = 0;
+        break;
+    case PEC_PCI_STK_ETU_RESET:
+        stack->pci_regs[reg] = val & 0x8000000000000000ull;
+        /* TODO: Implement reset */
+        break;
+    case PEC_PCI_STK_PBAIB_ERR_REPORT:
+        break;
+    case PEC_PCI_STK_PBAIB_TX_CMD_CRED:
+    case PEC_PCI_STK_PBAIB_TX_DAT_CRED:
+        stack->pci_regs[reg] = val;
+        break;
+    default:
+        qemu_log_mask(LOG_UNIMP, "phb4_pec_stk: pci_xscom_write 0x%"HWADDR_PRIx
+                      "=%"PRIx64"\n", addr, val);
+    }
+}
+
+static const MemoryRegionOps pnv_pec_stk_pci_xscom_ops = {
+    .read = pnv_pec_stk_pci_xscom_read,
+    .write = pnv_pec_stk_pci_xscom_write,
+    .valid.min_access_size = 8,
+    .valid.max_access_size = 8,
+    .impl.min_access_size = 8,
+    .impl.max_access_size = 8,
+    .endianness = DEVICE_BIG_ENDIAN,
+};
+
+static void pnv_pec_instance_init(Object *obj)
+{
+    PnvPhb4PecState *pec = PNV_PHB4_PEC(obj);
+    int i;
+
+    for (i = 0; i < PHB4_PEC_MAX_STACKS; i++) {
+        object_initialize_child(obj, "stack[*]", &pec->stacks[i],
+                                sizeof(pec->stacks[i]), TYPE_PNV_PHB4_PEC_STACK,
+                                &error_abort, NULL);
+    }
+}
+
+static void pnv_pec_realize(DeviceState *dev, Error **errp)
+{
+    PnvPhb4PecState *pec = PNV_PHB4_PEC(dev);
+    Error *local_err = NULL;
+    char name[64];
+    int i;
+
+    assert(pec->system_memory);
+
+    /* Create stacks */
+    for (i = 0; i < pec->num_stacks; i++) {
+        PnvPhb4PecStack *stack = &pec->stacks[i];
+        Object *stk_obj = OBJECT(stack);
+
+        object_property_set_int(stk_obj, i, "stack-no", &error_abort);
+        object_property_set_link(stk_obj, OBJECT(pec), "pec", &error_abort);
+        object_property_set_bool(stk_obj, true, "realized", &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+
+    /* Initialize the XSCOM regions for the PEC registers */
+    snprintf(name, sizeof(name), "xscom-pec-%d.%d-nest", pec->chip_id,
+             pec->index);
+    pnv_xscom_region_init(&pec->nest_regs_mr, OBJECT(dev),
+                          &pnv_pec_nest_xscom_ops, pec, name,
+                          PHB4_PEC_NEST_REGS_COUNT);
+
+    snprintf(name, sizeof(name), "xscom-pec-%d.%d-pci", pec->chip_id,
+             pec->index);
+    pnv_xscom_region_init(&pec->pci_regs_mr, OBJECT(dev),
+                          &pnv_pec_pci_xscom_ops, pec, name,
+                          PHB4_PEC_PCI_REGS_COUNT);
+}
+
+static int pnv_pec_dt_xscom(PnvXScomInterface *dev, void *fdt,
+                            int xscom_offset)
+{
+    PnvPhb4PecState *pec = PNV_PHB4_PEC(dev);
+    PnvPhb4PecClass *pecc = PNV_PHB4_PEC_GET_CLASS(dev);
+    uint32_t nbase = pecc->xscom_nest_base(pec);
+    uint32_t pbase = pecc->xscom_pci_base(pec);
+    int offset, i;
+    char *name;
+    uint32_t reg[] = {
+        cpu_to_be32(nbase),
+        cpu_to_be32(pecc->xscom_nest_size),
+        cpu_to_be32(pbase),
+        cpu_to_be32(pecc->xscom_pci_size),
+    };
+
+    name = g_strdup_printf("pbcq@%x", nbase);
+    offset = fdt_add_subnode(fdt, xscom_offset, name);
+    _FDT(offset);
+    g_free(name);
+
+    _FDT((fdt_setprop(fdt, offset, "reg", reg, sizeof(reg))));
+
+    _FDT((fdt_setprop_cell(fdt, offset, "ibm,pec-index", pec->index)));
+    _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 1)));
+    _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0)));
+    _FDT((fdt_setprop(fdt, offset, "compatible", pecc->compat,
+                      pecc->compat_size)));
+
+    for (i = 0; i < pec->num_stacks; i++) {
+        PnvPhb4PecStack *stack = &pec->stacks[i];
+        PnvPHB4 *phb = &stack->phb;
+        int stk_offset;
+
+        name = g_strdup_printf("stack@%x", i);
+        stk_offset = fdt_add_subnode(fdt, offset, name);
+        _FDT(stk_offset);
+        g_free(name);
+        _FDT((fdt_setprop(fdt, stk_offset, "compatible", pecc->stk_compat,
+                          pecc->stk_compat_size)));
+        _FDT((fdt_setprop_cell(fdt, stk_offset, "reg", i)));
+        _FDT((fdt_setprop_cell(fdt, stk_offset, "ibm,phb-index", phb->phb_id)));
+    }
+
+    return 0;
+}
+
+static Property pnv_pec_properties[] = {
+        DEFINE_PROP_UINT32("index", PnvPhb4PecState, index, 0),
+        DEFINE_PROP_UINT32("num-stacks", PnvPhb4PecState, num_stacks, 0),
+        DEFINE_PROP_UINT32("chip-id", PnvPhb4PecState, chip_id, 0),
+        DEFINE_PROP_LINK("system-memory", PnvPhb4PecState, system_memory,
+                     TYPE_MEMORY_REGION, MemoryRegion *),
+        DEFINE_PROP_END_OF_LIST(),
+};
+
+static uint32_t pnv_pec_xscom_pci_base(PnvPhb4PecState *pec)
+{
+    return PNV9_XSCOM_PEC_PCI_BASE + 0x1000000 * pec->index;
+}
+
+static uint32_t pnv_pec_xscom_nest_base(PnvPhb4PecState *pec)
+{
+    return PNV9_XSCOM_PEC_NEST_BASE + 0x400 * pec->index;
+}
+
+static void pnv_pec_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    PnvXScomInterfaceClass *xdc = PNV_XSCOM_INTERFACE_CLASS(klass);
+    PnvPhb4PecClass *pecc = PNV_PHB4_PEC_CLASS(klass);
+    static const char compat[] = "ibm,power9-pbcq";
+    static const char stk_compat[] = "ibm,power9-phb-stack";
+
+    xdc->dt_xscom = pnv_pec_dt_xscom;
+
+    dc->realize = pnv_pec_realize;
+    device_class_set_props(dc, pnv_pec_properties);
+    dc->user_creatable = false;
+
+    pecc->xscom_nest_base = pnv_pec_xscom_nest_base;
+    pecc->xscom_pci_base  = pnv_pec_xscom_pci_base;
+    pecc->xscom_nest_size = PNV9_XSCOM_PEC_NEST_SIZE;
+    pecc->xscom_pci_size  = PNV9_XSCOM_PEC_PCI_SIZE;
+    pecc->compat = compat;
+    pecc->compat_size = sizeof(compat);
+    pecc->stk_compat = stk_compat;
+    pecc->stk_compat_size = sizeof(stk_compat);
+}
+
+static const TypeInfo pnv_pec_type_info = {
+    .name          = TYPE_PNV_PHB4_PEC,
+    .parent        = TYPE_DEVICE,
+    .instance_size = sizeof(PnvPhb4PecState),
+    .instance_init = pnv_pec_instance_init,
+    .class_init    = pnv_pec_class_init,
+    .class_size    = sizeof(PnvPhb4PecClass),
+    .interfaces    = (InterfaceInfo[]) {
+        { TYPE_PNV_XSCOM_INTERFACE },
+        { }
+    }
+};
+
+static void pnv_pec_stk_instance_init(Object *obj)
+{
+    PnvPhb4PecStack *stack = PNV_PHB4_PEC_STACK(obj);
+
+    object_initialize_child(obj, "phb", &stack->phb, sizeof(stack->phb),
+                            TYPE_PNV_PHB4, &error_abort, NULL);
+}
+
+static void pnv_pec_stk_realize(DeviceState *dev, Error **errp)
+{
+    PnvPhb4PecStack *stack = PNV_PHB4_PEC_STACK(dev);
+    PnvPhb4PecState *pec = stack->pec;
+    char name[64];
+
+    assert(pec);
+
+    /* Initialize the XSCOM regions for the stack registers */
+    snprintf(name, sizeof(name), "xscom-pec-%d.%d-nest-stack-%d",
+             pec->chip_id, pec->index, stack->stack_no);
+    pnv_xscom_region_init(&stack->nest_regs_mr, OBJECT(stack),
+                          &pnv_pec_stk_nest_xscom_ops, stack, name,
+                          PHB4_PEC_NEST_STK_REGS_COUNT);
+
+    snprintf(name, sizeof(name), "xscom-pec-%d.%d-pci-stack-%d",
+             pec->chip_id, pec->index, stack->stack_no);
+    pnv_xscom_region_init(&stack->pci_regs_mr, OBJECT(stack),
+                          &pnv_pec_stk_pci_xscom_ops, stack, name,
+                          PHB4_PEC_PCI_STK_REGS_COUNT);
+
+    /* PHB pass-through */
+    snprintf(name, sizeof(name), "xscom-pec-%d.%d-pci-stack-%d-phb",
+             pec->chip_id, pec->index, stack->stack_no);
+    pnv_xscom_region_init(&stack->phb_regs_mr, OBJECT(&stack->phb),
+                          &pnv_phb4_xscom_ops, &stack->phb, name, 0x40);
+
+    /*
+     * Let the machine/chip realize the PHB object so that some of its
+     * fields can be customized more easily.
+     */
+}
+
+static Property pnv_pec_stk_properties[] = {
+        DEFINE_PROP_UINT32("stack-no", PnvPhb4PecStack, stack_no, 0),
+        DEFINE_PROP_LINK("pec", PnvPhb4PecStack, pec, TYPE_PNV_PHB4_PEC,
+                         PnvPhb4PecState *),
+        DEFINE_PROP_END_OF_LIST(),
+};
+
+static void pnv_pec_stk_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    device_class_set_props(dc, pnv_pec_stk_properties);
+    dc->realize = pnv_pec_stk_realize;
+    dc->user_creatable = false;
+
+    /* TODO: reset regs ? */
+}
+
+static const TypeInfo pnv_pec_stk_type_info = {
+    .name          = TYPE_PNV_PHB4_PEC_STACK,
+    .parent        = TYPE_DEVICE,
+    .instance_size = sizeof(PnvPhb4PecStack),
+    .instance_init = pnv_pec_stk_instance_init,
+    .class_init    = pnv_pec_stk_class_init,
+    .interfaces    = (InterfaceInfo[]) {
+        { TYPE_PNV_XSCOM_INTERFACE },
+        { }
+    }
+};
+
+static void pnv_pec_register_types(void)
+{
+    type_register_static(&pnv_pec_type_info);
+    type_register_static(&pnv_pec_stk_type_info);
+}
+
+type_init(pnv_pec_register_types);
diff --git a/hw/ppc/Kconfig b/hw/ppc/Kconfig
index e27efe9..354828b 100644
--- a/hw/ppc/Kconfig
+++ b/hw/ppc/Kconfig
@@ -135,6 +135,8 @@
     default y
     depends on PSERIES
     select XIVE
+    select PCI
+    select PCIE_PORT
 
 config XIVE_KVM
     bool
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index e61994c..139c857 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -40,6 +40,7 @@
 #include "hw/intc/intc.h"
 #include "hw/ipmi/ipmi.h"
 #include "target/ppc/mmu-hash64.h"
+#include "hw/pci/msi.h"
 
 #include "hw/ppc/xics.h"
 #include "hw/qdev-properties.h"
@@ -615,16 +616,29 @@
 static void pnv_chip_power8_pic_print_info(PnvChip *chip, Monitor *mon)
 {
     Pnv8Chip *chip8 = PNV8_CHIP(chip);
+    int i;
 
     ics_pic_print_info(&chip8->psi.ics, mon);
+    for (i = 0; i < chip->num_phbs; i++) {
+        pnv_phb3_msi_pic_print_info(&chip8->phbs[i].msis, mon);
+        ics_pic_print_info(&chip8->phbs[i].lsis, mon);
+    }
 }
 
 static void pnv_chip_power9_pic_print_info(PnvChip *chip, Monitor *mon)
 {
     Pnv9Chip *chip9 = PNV9_CHIP(chip);
+    int i, j;
 
     pnv_xive_pic_print_info(&chip9->xive, mon);
     pnv_psi_pic_print_info(&chip9->psi, mon);
+
+    for (i = 0; i < PNV9_CHIP_MAX_PEC; i++) {
+        PnvPhb4PecState *pec = &chip9->pecs[i];
+        for (j = 0; j < pec->num_stacks; j++) {
+            pnv_phb4_pic_print_info(&pec->stacks[j].phb, mon);
+        }
+    }
 }
 
 static uint64_t pnv_chip_power8_xscom_core_base(PnvChip *chip,
@@ -716,7 +730,7 @@
         exit(1);
     }
 
-    fw_size = load_image_targphys(fw_filename, FW_LOAD_ADDR, FW_MAX_SIZE);
+    fw_size = load_image_targphys(fw_filename, pnv->fw_load_addr, FW_MAX_SIZE);
     if (fw_size < 0) {
         error_report("Could not load OPAL firmware '%s'", fw_filename);
         exit(1);
@@ -748,6 +762,9 @@
         }
     }
 
+    /* MSIs are supported on this platform */
+    msi_nonbroken = true;
+
     /*
      * Check compatibility of the specified CPU with the machine
      * default.
@@ -1014,7 +1031,10 @@
 
 static void pnv_chip_power8_instance_init(Object *obj)
 {
+    PnvChip *chip = PNV_CHIP(obj);
     Pnv8Chip *chip8 = PNV8_CHIP(obj);
+    PnvChipClass *pcc = PNV_CHIP_GET_CLASS(obj);
+    int i;
 
     object_property_add_link(obj, "xics", TYPE_XICS_FABRIC,
                              (Object **)&chip8->xics,
@@ -1033,6 +1053,17 @@
 
     object_initialize_child(obj, "homer",  &chip8->homer, sizeof(chip8->homer),
                             TYPE_PNV8_HOMER, &error_abort, NULL);
+
+    for (i = 0; i < pcc->num_phbs; i++) {
+        object_initialize_child(obj, "phb[*]", &chip8->phbs[i],
+                                sizeof(chip8->phbs[i]), TYPE_PNV_PHB3,
+                                &error_abort, NULL);
+    }
+
+    /*
+     * Number of PHBs is the chip default
+     */
+    chip->num_phbs = pcc->num_phbs;
 }
 
 static void pnv_chip_icp_realize(Pnv8Chip *chip8, Error **errp)
@@ -1071,6 +1102,7 @@
     Pnv8Chip *chip8 = PNV8_CHIP(dev);
     Pnv8Psi *psi8 = &chip8->psi;
     Error *local_err = NULL;
+    int i;
 
     assert(chip8->xics);
 
@@ -1151,6 +1183,33 @@
     /* Homer mmio region */
     memory_region_add_subregion(get_system_memory(), PNV_HOMER_BASE(chip),
                                 &chip8->homer.regs);
+
+    /* PHB3 controllers */
+    for (i = 0; i < chip->num_phbs; i++) {
+        PnvPHB3 *phb = &chip8->phbs[i];
+        PnvPBCQState *pbcq = &phb->pbcq;
+
+        object_property_set_int(OBJECT(phb), i, "index", &error_fatal);
+        object_property_set_int(OBJECT(phb), chip->chip_id, "chip-id",
+                                &error_fatal);
+        object_property_set_bool(OBJECT(phb), true, "realized", &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+        qdev_set_parent_bus(DEVICE(phb), sysbus_get_default());
+
+        /* Populate the XSCOM address space. */
+        pnv_xscom_add_subregion(chip,
+                                PNV_XSCOM_PBCQ_NEST_BASE + 0x400 * phb->phb_id,
+                                &pbcq->xscom_nest_regs);
+        pnv_xscom_add_subregion(chip,
+                                PNV_XSCOM_PBCQ_PCI_BASE + 0x400 * phb->phb_id,
+                                &pbcq->xscom_pci_regs);
+        pnv_xscom_add_subregion(chip,
+                                PNV_XSCOM_PBCQ_SPCI_BASE + 0x040 * phb->phb_id,
+                                &pbcq->xscom_spci_regs);
+    }
 }
 
 static uint32_t pnv_chip_power8_xscom_pcba(PnvChip *chip, uint64_t addr)
@@ -1166,6 +1225,7 @@
 
     k->chip_cfam_id = 0x221ef04980000000ull;  /* P8 Murano DD2.1 */
     k->cores_mask = POWER8E_CORE_MASK;
+    k->num_phbs = 3;
     k->core_pir = pnv_chip_core_pir_p8;
     k->intc_create = pnv_chip_power8_intc_create;
     k->intc_reset = pnv_chip_power8_intc_reset;
@@ -1189,6 +1249,7 @@
 
     k->chip_cfam_id = 0x220ea04980000000ull; /* P8 Venice DD2.0 */
     k->cores_mask = POWER8_CORE_MASK;
+    k->num_phbs = 3;
     k->core_pir = pnv_chip_core_pir_p8;
     k->intc_create = pnv_chip_power8_intc_create;
     k->intc_reset = pnv_chip_power8_intc_reset;
@@ -1212,6 +1273,7 @@
 
     k->chip_cfam_id = 0x120d304980000000ull;  /* P8 Naples DD1.0 */
     k->cores_mask = POWER8_CORE_MASK;
+    k->num_phbs = 3;
     k->core_pir = pnv_chip_core_pir_p8;
     k->intc_create = pnv_chip_power8_intc_create;
     k->intc_reset = pnv_chip_power8_intc_reset;
@@ -1230,7 +1292,10 @@
 
 static void pnv_chip_power9_instance_init(Object *obj)
 {
+    PnvChip *chip = PNV_CHIP(obj);
     Pnv9Chip *chip9 = PNV9_CHIP(obj);
+    PnvChipClass *pcc = PNV_CHIP_GET_CLASS(obj);
+    int i;
 
     object_initialize_child(obj, "xive", &chip9->xive, sizeof(chip9->xive),
                             TYPE_PNV_XIVE, &error_abort, NULL);
@@ -1248,6 +1313,17 @@
 
     object_initialize_child(obj, "homer",  &chip9->homer, sizeof(chip9->homer),
                             TYPE_PNV9_HOMER, &error_abort, NULL);
+
+    for (i = 0; i < PNV9_CHIP_MAX_PEC; i++) {
+        object_initialize_child(obj, "pec[*]", &chip9->pecs[i],
+                                sizeof(chip9->pecs[i]), TYPE_PNV_PHB4_PEC,
+                                &error_abort, NULL);
+    }
+
+    /*
+     * Number of PHBs is the chip default
+     */
+    chip->num_phbs = pcc->num_phbs;
 }
 
 static void pnv_chip_quad_realize(Pnv9Chip *chip9, Error **errp)
@@ -1276,6 +1352,78 @@
     }
 }
 
+static void pnv_chip_power9_phb_realize(PnvChip *chip, Error **errp)
+{
+    Pnv9Chip *chip9 = PNV9_CHIP(chip);
+    Error *local_err = NULL;
+    int i, j;
+    int phb_id = 0;
+
+    for (i = 0; i < PNV9_CHIP_MAX_PEC; i++) {
+        PnvPhb4PecState *pec = &chip9->pecs[i];
+        PnvPhb4PecClass *pecc = PNV_PHB4_PEC_GET_CLASS(pec);
+        uint32_t pec_nest_base;
+        uint32_t pec_pci_base;
+
+        object_property_set_int(OBJECT(pec), i, "index", &error_fatal);
+        /*
+         * PEC0 -> 1 stack
+         * PEC1 -> 2 stacks
+         * PEC2 -> 3 stacks
+         */
+        object_property_set_int(OBJECT(pec), i + 1, "num-stacks",
+                                &error_fatal);
+        object_property_set_int(OBJECT(pec), chip->chip_id, "chip-id",
+                                 &error_fatal);
+        object_property_set_link(OBJECT(pec), OBJECT(get_system_memory()),
+                                 "system-memory", &error_abort);
+        object_property_set_bool(OBJECT(pec), true, "realized", &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+
+        pec_nest_base = pecc->xscom_nest_base(pec);
+        pec_pci_base = pecc->xscom_pci_base(pec);
+
+        pnv_xscom_add_subregion(chip, pec_nest_base, &pec->nest_regs_mr);
+        pnv_xscom_add_subregion(chip, pec_pci_base, &pec->pci_regs_mr);
+
+        for (j = 0; j < pec->num_stacks && phb_id < chip->num_phbs;
+             j++, phb_id++) {
+            PnvPhb4PecStack *stack = &pec->stacks[j];
+            Object *obj = OBJECT(&stack->phb);
+
+            object_property_set_int(obj, phb_id, "index", &error_fatal);
+            object_property_set_int(obj, chip->chip_id, "chip-id",
+                                    &error_fatal);
+            object_property_set_int(obj, PNV_PHB4_VERSION, "version",
+                                    &error_fatal);
+            object_property_set_int(obj, PNV_PHB4_DEVICE_ID, "device-id",
+                                    &error_fatal);
+            object_property_set_link(obj, OBJECT(stack), "stack", &error_abort);
+            object_property_set_bool(obj, true, "realized", &local_err);
+            if (local_err) {
+                error_propagate(errp, local_err);
+                return;
+            }
+            qdev_set_parent_bus(DEVICE(obj), sysbus_get_default());
+
+            /* Populate the XSCOM address space. */
+            pnv_xscom_add_subregion(chip,
+                                   pec_nest_base + 0x40 * (stack->stack_no + 1),
+                                   &stack->nest_regs_mr);
+            pnv_xscom_add_subregion(chip,
+                                    pec_pci_base + 0x40 * (stack->stack_no + 1),
+                                    &stack->pci_regs_mr);
+            pnv_xscom_add_subregion(chip,
+                                    pec_pci_base + PNV9_XSCOM_PEC_PCI_STK0 +
+                                    0x40 * stack->stack_no,
+                                    &stack->phb_regs_mr);
+        }
+    }
+}
+
 static void pnv_chip_power9_realize(DeviceState *dev, Error **errp)
 {
     PnvChipClass *pcc = PNV_CHIP_GET_CLASS(dev);
@@ -1378,6 +1526,13 @@
     /* Homer mmio region */
     memory_region_add_subregion(get_system_memory(), PNV9_HOMER_BASE(chip),
                                 &chip9->homer.regs);
+
+    /* PHBs */
+    pnv_chip_power9_phb_realize(chip, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
 }
 
 static uint32_t pnv_chip_power9_xscom_pcba(PnvChip *chip, uint64_t addr)
@@ -1404,6 +1559,7 @@
     k->xscom_core_base = pnv_chip_power9_xscom_core_base;
     k->xscom_pcba = pnv_chip_power9_xscom_pcba;
     dc->desc = "PowerNV Chip POWER9";
+    k->num_phbs = 6;
 
     device_class_set_parent_realize(dc, pnv_chip_power9_realize,
                                     &k->parent_realize);
@@ -1533,6 +1689,7 @@
     PnvChipClass *pcc = PNV_CHIP_GET_CLASS(chip);
     const char *typename = pnv_chip_core_typename(chip);
     int i, core_hwid;
+    PnvMachineState *pnv = PNV_MACHINE(qdev_get_machine());
 
     if (!object_class_by_name(typename)) {
         error_setg(errp, "Unable to find PowerNV CPU Core '%s'", typename);
@@ -1571,6 +1728,8 @@
         object_property_set_int(OBJECT(pnv_core),
                                 pcc->core_pir(chip, core_hwid),
                                 "pir", &error_fatal);
+        object_property_set_int(OBJECT(pnv_core), pnv->fw_load_addr,
+                                "hrmor", &error_fatal);
         object_property_set_link(OBJECT(pnv_core), OBJECT(chip), "chip",
                                  &error_abort);
         object_property_set_bool(OBJECT(pnv_core), true, "realized",
@@ -1605,6 +1764,7 @@
     DEFINE_PROP_UINT32("nr-cores", PnvChip, nr_cores, 1),
     DEFINE_PROP_UINT64("cores-mask", PnvChip, cores_mask, 0x0),
     DEFINE_PROP_UINT32("nr-threads", PnvChip, nr_threads, 1),
+    DEFINE_PROP_UINT32("num-phbs", PnvChip, num_phbs, 0),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -1638,14 +1798,23 @@
 static ICSState *pnv_ics_get(XICSFabric *xi, int irq)
 {
     PnvMachineState *pnv = PNV_MACHINE(xi);
-    int i;
+    int i, j;
 
     for (i = 0; i < pnv->num_chips; i++) {
+        PnvChip *chip = pnv->chips[i];
         Pnv8Chip *chip8 = PNV8_CHIP(pnv->chips[i]);
 
         if (ics_valid_irq(&chip8->psi.ics, irq)) {
             return &chip8->psi.ics;
         }
+        for (j = 0; j < chip->num_phbs; j++) {
+            if (ics_valid_irq(&chip8->phbs[j].lsis, irq)) {
+                return &chip8->phbs[j].lsis;
+            }
+            if (ics_valid_irq(ICS(&chip8->phbs[j].msis), irq)) {
+                return ICS(&chip8->phbs[j].msis);
+            }
+        }
     }
     return NULL;
 }
@@ -1653,11 +1822,17 @@
 static void pnv_ics_resend(XICSFabric *xi)
 {
     PnvMachineState *pnv = PNV_MACHINE(xi);
-    int i;
+    int i, j;
 
     for (i = 0; i < pnv->num_chips; i++) {
+        PnvChip *chip = pnv->chips[i];
         Pnv8Chip *chip8 = PNV8_CHIP(pnv->chips[i]);
+
         ics_resend(&chip8->psi.ics);
+        for (j = 0; j < chip->num_phbs; j++) {
+            ics_resend(&chip8->phbs[j].lsis);
+            ics_resend(ICS(&chip8->phbs[j].msis));
+        }
     }
 }
 
@@ -1767,6 +1942,22 @@
     pmc->dt_power_mgt = pnv_dt_power_mgt;
 }
 
+static bool pnv_machine_get_hb(Object *obj, Error **errp)
+{
+    PnvMachineState *pnv = PNV_MACHINE(obj);
+
+    return !!pnv->fw_load_addr;
+}
+
+static void pnv_machine_set_hb(Object *obj, bool value, Error **errp)
+{
+    PnvMachineState *pnv = PNV_MACHINE(obj);
+
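+    /*
+     * Hostboot mode: the firmware image is loaded at 0x8000000, which
+     * is also used as the HRMOR of the cores.
+     */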
+    if (value) {
+        pnv->fw_load_addr = 0x8000000;
+    }
+}
+
 static void pnv_machine_class_init(ObjectClass *oc, void *data)
 {
     MachineClass *mc = MACHINE_CLASS(oc);
@@ -1786,6 +1977,13 @@
      */
     mc->default_ram_size = INITRD_LOAD_ADDR + INITRD_MAX_SIZE;
     ispc->print_info = pnv_pic_print_info;
+
+    object_class_property_add_bool(oc, "hb-mode",
+                                   pnv_machine_get_hb, pnv_machine_set_hb,
+                                   &error_abort);
+    object_class_property_set_description(oc, "hb-mode",
+                              "Use a hostboot like boot loader",
+                              NULL);
 }
 
 #define DEFINE_PNV8_CHIP_TYPE(type, class_initfn) \
diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index 8ca5fbd..2345620 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -40,11 +40,11 @@
     return cpu_type;
 }
 
-static void pnv_core_cpu_reset(PowerPCCPU *cpu, PnvChip *chip)
+static void pnv_core_cpu_reset(PnvCore *pc, PowerPCCPU *cpu)
 {
     CPUState *cs = CPU(cpu);
     CPUPPCState *env = &cpu->env;
-    PnvChipClass *pcc = PNV_CHIP_GET_CLASS(chip);
+    PnvChipClass *pcc = PNV_CHIP_GET_CLASS(pc->chip);
 
     cpu_reset(cs);
 
@@ -56,7 +56,9 @@
     env->nip = 0x10;
     env->msr |= MSR_HVB; /* Hypervisor mode */
 
-    pcc->intc_reset(chip, cpu);
+    env->spr[SPR_HRMOR] = pc->hrmor;
+
+    pcc->intc_reset(pc->chip, cpu);
 }
 
 /*
@@ -162,14 +164,14 @@
     .endianness = DEVICE_BIG_ENDIAN,
 };
 
-static void pnv_core_cpu_realize(PowerPCCPU *cpu, PnvChip *chip, Error **errp)
+static void pnv_core_cpu_realize(PnvCore *pc, PowerPCCPU *cpu, Error **errp)
 {
     CPUPPCState *env = &cpu->env;
     int core_pir;
     int thread_index = 0; /* TODO: TCG supports only one thread */
     ppc_spr_t *pir = &env->spr_cb[SPR_PIR];
     Error *local_err = NULL;
-    PnvChipClass *pcc = PNV_CHIP_GET_CLASS(chip);
+    PnvChipClass *pcc = PNV_CHIP_GET_CLASS(pc->chip);
 
     object_property_set_bool(OBJECT(cpu), true, "realized", &local_err);
     if (local_err) {
@@ -177,13 +179,13 @@
         return;
     }
 
-    pcc->intc_create(chip, cpu, &local_err);
+    pcc->intc_create(pc->chip, cpu, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         return;
     }
 
-    core_pir = object_property_get_uint(OBJECT(cpu), "core-pir", &error_abort);
+    core_pir = object_property_get_uint(OBJECT(pc), "pir", &error_abort);
 
     /*
      * The PIR of a thread is the core PIR + the thread index. We will
@@ -203,7 +205,7 @@
     int i;
 
     for (i = 0; i < cc->nr_threads; i++) {
-        pnv_core_cpu_reset(pc->threads[i], pc->chip);
+        pnv_core_cpu_reset(pc, pc->threads[i]);
     }
 }
 
@@ -231,8 +233,6 @@
 
         snprintf(name, sizeof(name), "thread[%d]", i);
         object_property_add_child(OBJECT(pc), name, obj, &error_abort);
-        object_property_add_alias(obj, "core-pir", OBJECT(pc),
-                                  "pir", &error_abort);
 
         cpu->machine_data = g_new0(PnvCPUState, 1);
 
@@ -240,7 +240,7 @@
     }
 
     for (j = 0; j < cc->nr_threads; j++) {
-        pnv_core_cpu_realize(pc->threads[j], pc->chip, &local_err);
+        pnv_core_cpu_realize(pc, pc->threads[j], &local_err);
         if (local_err) {
             goto err;
         }
@@ -263,12 +263,12 @@
     error_propagate(errp, local_err);
 }
 
-static void pnv_core_cpu_unrealize(PowerPCCPU *cpu, PnvChip *chip)
+static void pnv_core_cpu_unrealize(PnvCore *pc, PowerPCCPU *cpu)
 {
     PnvCPUState *pnv_cpu = pnv_cpu_state(cpu);
-    PnvChipClass *pcc = PNV_CHIP_GET_CLASS(chip);
+    PnvChipClass *pcc = PNV_CHIP_GET_CLASS(pc->chip);
 
-    pcc->intc_destroy(chip, cpu);
+    pcc->intc_destroy(pc->chip, cpu);
     cpu_remove_sync(CPU(cpu));
     cpu->machine_data = NULL;
     g_free(pnv_cpu);
@@ -284,13 +284,14 @@
     qemu_unregister_reset(pnv_core_reset, pc);
 
     for (i = 0; i < cc->nr_threads; i++) {
-        pnv_core_cpu_unrealize(pc->threads[i], pc->chip);
+        pnv_core_cpu_unrealize(pc, pc->threads[i]);
     }
     g_free(pc->threads);
 }
 
 static Property pnv_core_properties[] = {
     DEFINE_PROP_UINT32("pir", PnvCore, pir, 0),
+    DEFINE_PROP_UINT64("hrmor", PnvCore, hrmor, 0),
     DEFINE_PROP_LINK("chip", PnvCore, chip, TYPE_PNV_CHIP, PnvChip *),
     DEFINE_PROP_END_OF_LIST(),
 };
@@ -324,6 +325,7 @@
     dc->realize = pnv_core_realize;
     dc->unrealize = pnv_core_unrealize;
     device_class_set_props(dc, pnv_core_properties);
+    dc->user_creatable = false;
 }
 
 #define DEFINE_PNV_CORE_TYPE(family, cpu_model) \
@@ -422,6 +424,7 @@
 
     dc->realize = pnv_quad_realize;
     device_class_set_props(dc, pnv_quad_properties);
+    dc->user_creatable = false;
 }
 
 static const TypeInfo pnv_quad_info = {
diff --git a/hw/ppc/pnv_homer.c b/hw/ppc/pnv_homer.c
index 93ae42f..9a26262 100644
--- a/hw/ppc/pnv_homer.c
+++ b/hw/ppc/pnv_homer.c
@@ -360,6 +360,7 @@
     dc->realize = pnv_homer_realize;
     dc->desc = "PowerNV HOMER Memory";
     device_class_set_props(dc, pnv_homer_properties);
+    dc->user_creatable = false;
 }
 
 static const TypeInfo pnv_homer_type_info = {
diff --git a/hw/ppc/pnv_lpc.c b/hw/ppc/pnv_lpc.c
index 22b2055..5989d72 100644
--- a/hw/ppc/pnv_lpc.c
+++ b/hw/ppc/pnv_lpc.c
@@ -762,6 +762,7 @@
     dc->realize = pnv_lpc_realize;
     dc->desc = "PowerNV LPC Controller";
     device_class_set_props(dc, pnv_lpc_properties);
+    dc->user_creatable = false;
 }
 
 static const TypeInfo pnv_lpc_info = {
@@ -825,6 +826,7 @@
     qemu_irq *irqs;
     qemu_irq_handler handler;
     PnvMachineState *pnv = PNV_MACHINE(qdev_get_machine());
+    bool hostboot_mode = !!pnv->fw_load_addr;
 
     /* let isa_bus_new() create its own bridge on SysBus otherwise
 * devices specified on the command line won't find the bus and
@@ -859,7 +861,9 @@
      * Start disabled. The HIOMAP protocol will activate the mapping
      * with HIOMAP_C_CREATE_WRITE_WINDOW
      */
-    memory_region_set_enabled(&pnv->pnor->mmio, false);
+    if (!hostboot_mode) {
+        memory_region_set_enabled(&pnv->pnor->mmio, false);
+    }
 
     return isa_bus;
 }
diff --git a/hw/ppc/pnv_occ.c b/hw/ppc/pnv_occ.c
index 2173fac..5a716c2 100644
--- a/hw/ppc/pnv_occ.c
+++ b/hw/ppc/pnv_occ.c
@@ -280,6 +280,7 @@
     dc->realize = pnv_occ_realize;
     dc->desc = "PowerNV OCC Controller";
     device_class_set_props(dc, pnv_occ_properties);
+    dc->user_creatable = false;
 }
 
 static const TypeInfo pnv_occ_type_info = {
diff --git a/hw/ppc/pnv_pnor.c b/hw/ppc/pnv_pnor.c
index f761d8d..c365ee5 100644
--- a/hw/ppc/pnv_pnor.c
+++ b/hw/ppc/pnv_pnor.c
@@ -11,6 +11,7 @@
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "qemu/log.h"
+#include "qemu/units.h"
 #include "sysemu/block-backend.h"
 #include "sysemu/blockdev.h"
 #include "hw/loader.h"
@@ -46,7 +47,8 @@
     ret = blk_pwrite(s->blk, offset, s->storage + offset,
                      offset_end - offset, 0);
     if (ret < 0) {
-        error_report("Could not update PNOR: %s", strerror(-ret));
+        error_report("Could not update PNOR offset=0x%" PRIx32" : %s", offset,
+                     strerror(-ret));
     }
 }
 
@@ -111,7 +113,7 @@
 }
 
 static Property pnv_pnor_properties[] = {
-    DEFINE_PROP_INT64("size", PnvPnor, size, 128 << 20),
+    DEFINE_PROP_INT64("size", PnvPnor, size, 128 * MiB),
     DEFINE_PROP_DRIVE("drive", PnvPnor, blk),
     DEFINE_PROP_END_OF_LIST(),
 };
diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index 4c5fa29..4a11fb1 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -1490,24 +1490,6 @@
 }
 
 /*****************************************************************************/
-/* Debug port */
-void PPC_debug_write (void *opaque, uint32_t addr, uint32_t val)
-{
-    addr &= 0xF;
-    switch (addr) {
-    case 0:
-        printf("%c", val);
-        break;
-    case 1:
-        printf("\n");
-        fflush(stdout);
-        break;
-    case 2:
-        printf("Set loglevel to %04" PRIx32 "\n", val);
-        qemu_set_log(val | 0x100);
-        break;
-    }
-}
 
 int ppc_cpu_pir(PowerPCCPU *cpu)
 {
diff --git a/hw/ppc/prep.c b/hw/ppc/prep.c
index 862345c..111cc80 100644
--- a/hw/ppc/prep.c
+++ b/hw/ppc/prep.c
@@ -42,7 +42,7 @@
 #include "hw/loader.h"
 #include "hw/rtc/mc146818rtc.h"
 #include "hw/isa/pc87312.h"
-#include "hw/net/ne2000-isa.h"
+#include "hw/qdev-properties.h"
 #include "sysemu/arch_init.h"
 #include "sysemu/kvm.h"
 #include "sysemu/qtest.h"
@@ -60,178 +60,9 @@
 
 #define CFG_ADDR 0xf0000510
 
-#define BIOS_SIZE (1 * MiB)
-#define BIOS_FILENAME "ppc_rom.bin"
 #define KERNEL_LOAD_ADDR 0x01000000
 #define INITRD_LOAD_ADDR 0x01800000
 
-/* Constants for devices init */
-static const int ide_iobase[2] = { 0x1f0, 0x170 };
-static const int ide_iobase2[2] = { 0x3f6, 0x376 };
-static const int ide_irq[2] = { 13, 13 };
-
-#define NE2000_NB_MAX 6
-
-static uint32_t ne2000_io[NE2000_NB_MAX] = { 0x300, 0x320, 0x340, 0x360, 0x280, 0x380 };
-static int ne2000_irq[NE2000_NB_MAX] = { 9, 10, 11, 3, 4, 5 };
-
-/* ISA IO ports bridge */
-#define PPC_IO_BASE 0x80000000
-
-/* Fake super-io ports for PREP platform (Intel 82378ZB) */
-typedef struct sysctrl_t {
-    qemu_irq reset_irq;
-    Nvram *nvram;
-    uint8_t state;
-    uint8_t syscontrol;
-    int contiguous_map;
-    qemu_irq contiguous_map_irq;
-    int endian;
-} sysctrl_t;
-
-enum {
-    STATE_HARDFILE = 0x01,
-};
-
-static sysctrl_t *sysctrl;
-
-static void PREP_io_800_writeb (void *opaque, uint32_t addr, uint32_t val)
-{
-    sysctrl_t *sysctrl = opaque;
-
-    trace_prep_io_800_writeb(addr - PPC_IO_BASE, val);
-    switch (addr) {
-    case 0x0092:
-        /* Special port 92 */
-        /* Check soft reset asked */
-        if (val & 0x01) {
-            qemu_irq_raise(sysctrl->reset_irq);
-        } else {
-            qemu_irq_lower(sysctrl->reset_irq);
-        }
-        /* Check LE mode */
-        if (val & 0x02) {
-            sysctrl->endian = 1;
-        } else {
-            sysctrl->endian = 0;
-        }
-        break;
-    case 0x0800:
-        /* Motorola CPU configuration register : read-only */
-        break;
-    case 0x0802:
-        /* Motorola base module feature register : read-only */
-        break;
-    case 0x0803:
-        /* Motorola base module status register : read-only */
-        break;
-    case 0x0808:
-        /* Hardfile light register */
-        if (val & 1)
-            sysctrl->state |= STATE_HARDFILE;
-        else
-            sysctrl->state &= ~STATE_HARDFILE;
-        break;
-    case 0x0810:
-        /* Password protect 1 register */
-        if (sysctrl->nvram != NULL) {
-            NvramClass *k = NVRAM_GET_CLASS(sysctrl->nvram);
-            (k->toggle_lock)(sysctrl->nvram, 1);
-        }
-        break;
-    case 0x0812:
-        /* Password protect 2 register */
-        if (sysctrl->nvram != NULL) {
-            NvramClass *k = NVRAM_GET_CLASS(sysctrl->nvram);
-            (k->toggle_lock)(sysctrl->nvram, 2);
-        }
-        break;
-    case 0x0814:
-        /* L2 invalidate register */
-        //        tlb_flush(first_cpu, 1);
-        break;
-    case 0x081C:
-        /* system control register */
-        sysctrl->syscontrol = val & 0x0F;
-        break;
-    case 0x0850:
-        /* I/O map type register */
-        sysctrl->contiguous_map = val & 0x01;
-        qemu_set_irq(sysctrl->contiguous_map_irq, sysctrl->contiguous_map);
-        break;
-    default:
-        printf("ERROR: unaffected IO port write: %04" PRIx32
-               " => %02" PRIx32"\n", addr, val);
-        break;
-    }
-}
-
-static uint32_t PREP_io_800_readb (void *opaque, uint32_t addr)
-{
-    sysctrl_t *sysctrl = opaque;
-    uint32_t retval = 0xFF;
-
-    switch (addr) {
-    case 0x0092:
-        /* Special port 92 */
-        retval = sysctrl->endian << 1;
-        break;
-    case 0x0800:
-        /* Motorola CPU configuration register */
-        retval = 0xEF; /* MPC750 */
-        break;
-    case 0x0802:
-        /* Motorola Base module feature register */
-        retval = 0xAD; /* No ESCC, PMC slot neither ethernet */
-        break;
-    case 0x0803:
-        /* Motorola base module status register */
-        retval = 0xE0; /* Standard MPC750 */
-        break;
-    case 0x080C:
-        /* Equipment present register:
-         *  no L2 cache
-         *  no upgrade processor
-         *  no cards in PCI slots
-         *  SCSI fuse is bad
-         */
-        retval = 0x3C;
-        break;
-    case 0x0810:
-        /* Motorola base module extended feature register */
-        retval = 0x39; /* No USB, CF and PCI bridge. NVRAM present */
-        break;
-    case 0x0814:
-        /* L2 invalidate: don't care */
-        break;
-    case 0x0818:
-        /* Keylock */
-        retval = 0x00;
-        break;
-    case 0x081C:
-        /* system control register
-         * 7 - 6 / 1 - 0: L2 cache enable
-         */
-        retval = sysctrl->syscontrol;
-        break;
-    case 0x0823:
-        /* */
-        retval = 0x03; /* no L2 cache */
-        break;
-    case 0x0850:
-        /* I/O map type register */
-        retval = sysctrl->contiguous_map;
-        break;
-    default:
-        printf("ERROR: unaffected IO port: %04" PRIx32 " read\n", addr);
-        break;
-    }
-    trace_prep_io_800_readb(addr - PPC_IO_BASE, retval);
-
-    return retval;
-}
-
-
 #define NVRAM_SIZE        0x2000
 
 static void fw_cfg_boot_set(void *opaque, const char *boot_device,
@@ -247,17 +78,6 @@
     cpu_reset(CPU(cpu));
 }
 
-static const MemoryRegionPortio prep_portio_list[] = {
-    /* System control ports */
-    { 0x0092, 1, 1, .read = PREP_io_800_readb, .write = PREP_io_800_writeb, },
-    { 0x0800, 0x52, 1,
-      .read = PREP_io_800_readb, .write = PREP_io_800_writeb, },
-    /* Special port to get debug messages from Open-Firmware */
-    { 0x0F00, 4, 1, .write = PPC_debug_write, },
-    PORTIO_END_OF_LIST(),
-};
-
-static PortioList prep_port_list;
 
 /*****************************************************************************/
 /* NVRAM helpers */
@@ -397,207 +217,6 @@
     return 0;
 }
 
-/* PowerPC PREP hardware initialisation */
-static void ppc_prep_init(MachineState *machine)
-{
-    ram_addr_t ram_size = machine->ram_size;
-    const char *kernel_filename = machine->kernel_filename;
-    const char *kernel_cmdline = machine->kernel_cmdline;
-    const char *initrd_filename = machine->initrd_filename;
-    const char *boot_device = machine->boot_order;
-    MemoryRegion *sysmem = get_system_memory();
-    PowerPCCPU *cpu = NULL;
-    CPUPPCState *env = NULL;
-    Nvram *m48t59;
-#if 0
-    MemoryRegion *xcsr = g_new(MemoryRegion, 1);
-#endif
-    int linux_boot, i, nb_nics1;
-    MemoryRegion *ram = g_new(MemoryRegion, 1);
-    uint32_t kernel_base, initrd_base;
-    long kernel_size, initrd_size;
-    DeviceState *dev;
-    PCIHostState *pcihost;
-    PCIBus *pci_bus;
-    PCIDevice *pci;
-    ISABus *isa_bus;
-    ISADevice *isa;
-    int ppc_boot_device;
-    DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS];
-
-    sysctrl = g_malloc0(sizeof(sysctrl_t));
-
-    linux_boot = (kernel_filename != NULL);
-
-    /* init CPUs */
-    for (i = 0; i < machine->smp.cpus; i++) {
-        cpu = POWERPC_CPU(cpu_create(machine->cpu_type));
-        env = &cpu->env;
-
-        if (env->flags & POWERPC_FLAG_RTC_CLK) {
-            /* POWER / PowerPC 601 RTC clock frequency is 7.8125 MHz */
-            cpu_ppc_tb_init(env, 7812500UL);
-        } else {
-            /* Set time-base frequency to 100 Mhz */
-            cpu_ppc_tb_init(env, 100UL * 1000UL * 1000UL);
-        }
-        qemu_register_reset(ppc_prep_reset, cpu);
-    }
-
-    /* allocate RAM */
-    memory_region_allocate_system_memory(ram, NULL, "ppc_prep.ram", ram_size);
-    memory_region_add_subregion(sysmem, 0, ram);
-
-    if (linux_boot) {
-        kernel_base = KERNEL_LOAD_ADDR;
-        /* now we can load the kernel */
-        kernel_size = load_image_targphys(kernel_filename, kernel_base,
-                                          ram_size - kernel_base);
-        if (kernel_size < 0) {
-            error_report("could not load kernel '%s'", kernel_filename);
-            exit(1);
-        }
-        /* load initrd */
-        if (initrd_filename) {
-            initrd_base = INITRD_LOAD_ADDR;
-            initrd_size = load_image_targphys(initrd_filename, initrd_base,
-                                              ram_size - initrd_base);
-            if (initrd_size < 0) {
-                error_report("could not load initial ram disk '%s'",
-                             initrd_filename);
-                exit(1);
-            }
-        } else {
-            initrd_base = 0;
-            initrd_size = 0;
-        }
-        ppc_boot_device = 'm';
-    } else {
-        kernel_base = 0;
-        kernel_size = 0;
-        initrd_base = 0;
-        initrd_size = 0;
-        ppc_boot_device = '\0';
-        /* For now, OHW cannot boot from the network. */
-        for (i = 0; boot_device[i] != '\0'; i++) {
-            if (boot_device[i] >= 'a' && boot_device[i] <= 'f') {
-                ppc_boot_device = boot_device[i];
-                break;
-            }
-        }
-        if (ppc_boot_device == '\0') {
-            error_report("No valid boot device for Mac99 machine");
-            exit(1);
-        }
-    }
-
-    if (PPC_INPUT(env) != PPC_FLAGS_INPUT_6xx) {
-        error_report("Only 6xx bus is supported on PREP machine");
-        exit(1);
-    }
-
-    dev = qdev_create(NULL, "raven-pcihost");
-    if (bios_name == NULL) {
-        bios_name = BIOS_FILENAME;
-    }
-    qdev_prop_set_string(dev, "bios-name", bios_name);
-    qdev_prop_set_uint32(dev, "elf-machine", PPC_ELF_MACHINE);
-    qdev_prop_set_bit(dev, "is-legacy-prep", true);
-    pcihost = PCI_HOST_BRIDGE(dev);
-    object_property_add_child(qdev_get_machine(), "raven", OBJECT(dev), NULL);
-    qdev_init_nofail(dev);
-    pci_bus = (PCIBus *)qdev_get_child_bus(dev, "pci.0");
-    if (pci_bus == NULL) {
-        error_report("Couldn't create PCI host controller");
-        exit(1);
-    }
-    sysctrl->contiguous_map_irq = qdev_get_gpio_in(dev, 0);
-
-    /* PCI -> ISA bridge */
-    pci = pci_create_simple(pci_bus, PCI_DEVFN(1, 0), "i82378");
-    cpu = POWERPC_CPU(first_cpu);
-    qdev_connect_gpio_out(&pci->qdev, 0,
-                          cpu->env.irq_inputs[PPC6xx_INPUT_INT]);
-    sysbus_connect_irq(&pcihost->busdev, 0, qdev_get_gpio_in(&pci->qdev, 9));
-    sysbus_connect_irq(&pcihost->busdev, 1, qdev_get_gpio_in(&pci->qdev, 11));
-    sysbus_connect_irq(&pcihost->busdev, 2, qdev_get_gpio_in(&pci->qdev, 9));
-    sysbus_connect_irq(&pcihost->busdev, 3, qdev_get_gpio_in(&pci->qdev, 11));
-    isa_bus = ISA_BUS(qdev_get_child_bus(DEVICE(pci), "isa.0"));
-
-    /* Super I/O (parallel + serial ports) */
-    isa = isa_create(isa_bus, TYPE_PC87312_SUPERIO);
-    dev = DEVICE(isa);
-    qdev_prop_set_uint8(dev, "config", 13); /* fdc, ser0, ser1, par0 */
-    qdev_init_nofail(dev);
-
-    /* init basic PC hardware */
-    pci_vga_init(pci_bus);
-
-    nb_nics1 = nb_nics;
-    if (nb_nics1 > NE2000_NB_MAX)
-        nb_nics1 = NE2000_NB_MAX;
-    for(i = 0; i < nb_nics1; i++) {
-        if (nd_table[i].model == NULL) {
-            nd_table[i].model = g_strdup("ne2k_isa");
-        }
-        if (strcmp(nd_table[i].model, "ne2k_isa") == 0) {
-            isa_ne2000_init(isa_bus, ne2000_io[i], ne2000_irq[i],
-                            &nd_table[i]);
-        } else {
-            pci_nic_init_nofail(&nd_table[i], pci_bus, "ne2k_pci", NULL);
-        }
-    }
-
-    ide_drive_get(hd, ARRAY_SIZE(hd));
-    for(i = 0; i < MAX_IDE_BUS; i++) {
-        isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i], ide_irq[i],
-                     hd[2 * i],
-                     hd[2 * i + 1]);
-    }
-
-    cpu = POWERPC_CPU(first_cpu);
-    sysctrl->reset_irq = cpu->env.irq_inputs[PPC6xx_INPUT_HRESET];
-
-    portio_list_init(&prep_port_list, NULL, prep_portio_list, sysctrl, "prep");
-    portio_list_add(&prep_port_list, isa_address_space_io(isa), 0x0);
-
-    /*
-     * PowerPC control and status register group: unimplemented,
-     * would be at address 0xFEFF0000.
-     */
-
-    if (machine_usb(machine)) {
-        pci_create_simple(pci_bus, -1, "pci-ohci");
-    }
-
-    m48t59 = m48t59_init_isa(isa_bus, 0x0074, NVRAM_SIZE, 2000, 59);
-    if (m48t59 == NULL)
-        return;
-    sysctrl->nvram = m48t59;
-
-    /* Initialise NVRAM */
-    PPC_NVRAM_set_params(m48t59, NVRAM_SIZE, "PREP", ram_size,
-                         ppc_boot_device,
-                         kernel_base, kernel_size,
-                         kernel_cmdline,
-                         initrd_base, initrd_size,
-                         /* XXX: need an option to load a NVRAM image */
-                         0,
-                         graphic_width, graphic_height, graphic_depth);
-}
-
-static void prep_machine_init(MachineClass *mc)
-{
-    mc->deprecation_reason = "use 40p machine type instead";
-    mc->desc = "PowerPC PREP platform";
-    mc->init = ppc_prep_init;
-    mc->block_default_type = IF_IDE;
-    mc->max_cpus = MAX_CPUS;
-    mc->default_boot_order = "cad";
-    mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("602");
-    mc->default_display = "std";
-}
-
 static int prep_set_cmos_checksum(DeviceState *dev, void *opaque)
 {
     uint16_t checksum = *(uint16_t *)opaque;
@@ -821,4 +440,3 @@
 }
 
 DEFINE_MACHINE("40p", ibm_40p_machine_init)
-DEFINE_MACHINE("prep", prep_machine_init)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index a0076e5..c9b2e0a 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -46,6 +46,7 @@
 #include "migration/qemu-file-types.h"
 #include "migration/global_state.h"
 #include "migration/register.h"
+#include "migration/blocker.h"
 #include "mmu-hash64.h"
 #include "mmu-book3s-v3.h"
 #include "cpu-models.h"
@@ -1677,6 +1678,14 @@
     first_ppc_cpu->env.gpr[5] = 0;
 
     spapr->cas_reboot = false;
+
+    spapr->mc_status = -1;
+    spapr->guest_machine_check_addr = -1;
+
+    /* Signal all vCPUs waiting on this condition */
+    qemu_cond_broadcast(&spapr->mc_delivery_cond);
+
+    migrate_del_blocker(spapr->fwnmi_migration_blocker);
 }
 
 static void spapr_create_nvram(SpaprMachineState *spapr)
@@ -1959,6 +1968,42 @@
     },
 };
 
+static bool spapr_fwnmi_needed(void *opaque)
+{
+    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
+
+    return spapr->guest_machine_check_addr != -1;
+}
+
+static int spapr_fwnmi_pre_save(void *opaque)
+{
+    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
+
+    /*
+     * Check if machine check handling is in progress and print a
+     * warning message.
+     */
+    if (spapr->mc_status != -1) {
+        warn_report("A machine check is being handled during migration. The"
+                "handler may run and log hardware error on the destination");
+    }
+
+    return 0;
+}
+
+static const VMStateDescription vmstate_spapr_machine_check = {
+    .name = "spapr_machine_check",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = spapr_fwnmi_needed,
+    .pre_save = spapr_fwnmi_pre_save,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
+        VMSTATE_INT32(mc_status, SpaprMachineState),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
 static const VMStateDescription vmstate_spapr = {
     .name = "spapr",
     .version_id = 3,
@@ -1992,6 +2037,8 @@
         &vmstate_spapr_dtb,
         &vmstate_spapr_cap_large_decr,
         &vmstate_spapr_cap_ccf_assist,
+        &vmstate_spapr_cap_fwnmi,
+        &vmstate_spapr_machine_check,
         NULL
     }
 };
@@ -2807,6 +2854,13 @@
         spapr_create_lmb_dr_connectors(spapr);
     }
 
+    if (spapr_get_cap(spapr, SPAPR_CAP_FWNMI_MCE) == SPAPR_CAP_ON) {
+        /* Create the error string for live migration blocker */
+        error_setg(&spapr->fwnmi_migration_blocker,
+            "A machine check is being handled during migration. The handler"
+            "may run and log hardware error on the destination");
+    }
+
     /* Set up RTAS event infrastructure */
     spapr_events_init(spapr);
 
@@ -2970,6 +3024,8 @@
 
         kvmppc_spapr_enable_inkernel_multitce();
     }
+
+    qemu_cond_init(&spapr->mc_delivery_cond);
 }
 
 static int spapr_kvm_type(MachineState *machine, const char *vm_type)
@@ -4397,7 +4453,8 @@
     smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 16; /* 64kiB */
     smc->default_caps.caps[SPAPR_CAP_NESTED_KVM_HV] = SPAPR_CAP_OFF;
     smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_ON;
-    smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF;
+    smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
+    smc->default_caps.caps[SPAPR_CAP_FWNMI_MCE] = SPAPR_CAP_ON;
     spapr_caps_add_properties(smc, &error_abort);
     smc->irq = &spapr_irq_dual;
     smc->dr_phb_enabled = true;
@@ -4465,8 +4522,12 @@
  */
 static void spapr_machine_4_2_class_options(MachineClass *mc)
 {
+    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
+
     spapr_machine_5_0_class_options(mc);
     compat_props_add(mc->compat_props, hw_compat_4_2, hw_compat_4_2_len);
+    smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF;
+    smc->default_caps.caps[SPAPR_CAP_FWNMI_MCE] = SPAPR_CAP_OFF;
 }
 
 DEFINE_SPAPR_MACHINE(4_2, "4.2", false);
diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
index 481dfd2..8b27d3a 100644
--- a/hw/ppc/spapr_caps.c
+++ b/hw/ppc/spapr_caps.c
@@ -485,17 +485,48 @@
     uint8_t kvm_val = kvmppc_get_cap_count_cache_flush_assist();
 
     if (tcg_enabled() && val) {
-        /* TODO - for now only allow broken for TCG */
-        error_setg(errp,
-"Requested count cache flush assist capability level not supported by tcg,"
-                   " try appending -machine cap-ccf-assist=off");
+        /* TCG doesn't implement anything here, but allow with a warning */
+        warn_report("TCG doesn't support requested feature, cap-ccf-assist=on");
     } else if (kvm_enabled() && (val > kvm_val)) {
+        uint8_t kvm_ibs = kvmppc_get_cap_safe_indirect_branch();
+
+        if (kvm_ibs == SPAPR_CAP_FIXED_CCD) {
+            /*
+             * If we don't have CCF assist on the host, the assist
+             * instruction is a harmless no-op.  It won't correctly
+             * implement the count cache flush *but* if we have
+             * count-cache-disabled in the host, that flush is
+             * unnecessary.  So, specifically allow this case.  This
+             * allows us to have better performance on POWER9 DD2.3,
+             * while still working on POWER9 DD2.2 and POWER8 host
+             * CPUs.
+             */
+            return;
+        }
         error_setg(errp,
 "Requested count cache flush assist capability level not supported by kvm,"
                    " try appending -machine cap-ccf-assist=off");
     }
 }
 
+static void cap_fwnmi_mce_apply(SpaprMachineState *spapr, uint8_t val,
+                                Error **errp)
+{
+    if (!val) {
+        return; /* Disabled by default */
+    }
+
+    if (tcg_enabled()) {
+        warn_report("Firmware Assisted Non-Maskable Interrupts(FWNMI) not "
+                    "supported in TCG");
+    } else if (kvm_enabled()) {
+        if (kvmppc_set_fwnmi() < 0) {
+            error_setg(errp, "Firmware Assisted Non-Maskable Interrupts(FWNMI) "
+                             "not supported by KVM");
+        }
+    }
+}
+
 SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
     [SPAPR_CAP_HTM] = {
         .name = "htm",
@@ -595,6 +626,15 @@
         .type = "bool",
         .apply = cap_ccf_assist_apply,
     },
+    [SPAPR_CAP_FWNMI_MCE] = {
+        .name = "fwnmi-mce",
+        .description = "Handle fwnmi machine check exceptions",
+        .index = SPAPR_CAP_FWNMI_MCE,
+        .get = spapr_cap_get_bool,
+        .set = spapr_cap_set_bool,
+        .type = "bool",
+        .apply = cap_fwnmi_mce_apply,
+    },
 };
 
 static SpaprCapabilities default_caps_with_cpu(SpaprMachineState *spapr,
@@ -734,6 +774,7 @@
 SPAPR_CAP_MIG_STATE(nested_kvm_hv, SPAPR_CAP_NESTED_KVM_HV);
 SPAPR_CAP_MIG_STATE(large_decr, SPAPR_CAP_LARGE_DECREMENTER);
 SPAPR_CAP_MIG_STATE(ccf_assist, SPAPR_CAP_CCF_ASSIST);
+SPAPR_CAP_MIG_STATE(fwnmi, SPAPR_CAP_FWNMI_MCE);
 
 void spapr_caps_init(SpaprMachineState *spapr)
 {
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index e355e00..884e455 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -40,8 +40,10 @@
 #include "hw/ppc/spapr_drc.h"
 #include "qemu/help_option.h"
 #include "qemu/bcd.h"
+#include "qemu/main-loop.h"
 #include "hw/ppc/spapr_ovec.h"
 #include <libfdt.h>
+#include "migration/blocker.h"
 
 #define RTAS_LOG_VERSION_MASK                   0xff000000
 #define   RTAS_LOG_VERSION_6                    0x06000000
@@ -213,6 +215,104 @@
     struct rtas_event_log_v6_hp hp;
 } QEMU_PACKED;
 
+struct rtas_event_log_v6_mc {
+#define RTAS_LOG_V6_SECTION_ID_MC                   0x4D43 /* MC */
+    struct rtas_event_log_v6_section_header hdr;
+    uint32_t fru_id;
+    uint32_t proc_id;
+    uint8_t error_type;
+#define RTAS_LOG_V6_MC_TYPE_UE                           0
+#define RTAS_LOG_V6_MC_TYPE_SLB                          1
+#define RTAS_LOG_V6_MC_TYPE_ERAT                         2
+#define RTAS_LOG_V6_MC_TYPE_TLB                          4
+#define RTAS_LOG_V6_MC_TYPE_D_CACHE                      5
+#define RTAS_LOG_V6_MC_TYPE_I_CACHE                      7
+    uint8_t sub_err_type;
+#define RTAS_LOG_V6_MC_UE_INDETERMINATE                  0
+#define RTAS_LOG_V6_MC_UE_IFETCH                         1
+#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH         2
+#define RTAS_LOG_V6_MC_UE_LOAD_STORE                     3
+#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE     4
+#define RTAS_LOG_V6_MC_SLB_PARITY                        0
+#define RTAS_LOG_V6_MC_SLB_MULTIHIT                      1
+#define RTAS_LOG_V6_MC_SLB_INDETERMINATE                 2
+#define RTAS_LOG_V6_MC_ERAT_PARITY                       1
+#define RTAS_LOG_V6_MC_ERAT_MULTIHIT                     2
+#define RTAS_LOG_V6_MC_ERAT_INDETERMINATE                3
+#define RTAS_LOG_V6_MC_TLB_PARITY                        1
+#define RTAS_LOG_V6_MC_TLB_MULTIHIT                      2
+#define RTAS_LOG_V6_MC_TLB_INDETERMINATE                 3
+    uint8_t reserved_1[6];
+    uint64_t effective_address;
+    uint64_t logical_address;
+} QEMU_PACKED;
+
+struct mc_extended_log {
+    struct rtas_event_log_v6 v6hdr;
+    struct rtas_event_log_v6_mc mc;
+} QEMU_PACKED;
+
+struct MC_ierror_table {
+    unsigned long srr1_mask;
+    unsigned long srr1_value;
+    bool nip_valid; /* nip is a valid indicator of faulting address */
+    uint8_t error_type;
+    uint8_t error_subtype;
+    unsigned int initiator;
+    unsigned int severity;
+};
+
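+/*
+ * Map SRR1 bit patterns of instruction-side machine checks to the RTAS
+ * error log type/subtype reported to the guest.
+ */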
+static const struct MC_ierror_table mc_ierror_table[] = {
+{ 0x00000000081c0000, 0x0000000000040000, true,
+  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_IFETCH,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000080000, true,
+  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000000c0000, true,
+  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000100000, true,
+  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000140000, true,
+  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000180000, true,
+  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, } };
+
+struct MC_derror_table {
+    unsigned long dsisr_value;
+    bool dar_valid; /* dar is a valid indicator of faulting address */
+    uint8_t error_type;
+    uint8_t error_subtype;
+    unsigned int initiator;
+    unsigned int severity;
+};
+
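+/*
+ * Map DSISR bits of data-side (load/store) machine checks to the RTAS
+ * error log type/subtype reported to the guest.
+ */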
+static const struct MC_derror_table mc_derror_table[] = {
+{ 0x00008000, false,
+  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_LOAD_STORE,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00004000, true,
+  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000800, true,
+  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000400, true,
+  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000080, true,
+  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,  /* Before PARITY */
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000100, true,
+  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, } };
+
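+/* SRR1 bit 42 is set for load/store (data-side) machine checks */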
+#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
+
 typedef enum EventClass {
     EVENT_CLASS_INTERNAL_ERRORS     = 0,
     EVENT_CLASS_EPOW                = 1,
@@ -622,6 +722,175 @@
                             RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id);
 }
 
+static uint32_t spapr_mce_get_elog_type(PowerPCCPU *cpu, bool recovered,
+                                        struct mc_extended_log *ext_elog)
+{
+    int i;
+    CPUPPCState *env = &cpu->env;
+    uint32_t summary;
+    uint64_t dsisr = env->spr[SPR_DSISR];
+
+    summary = RTAS_LOG_VERSION_6 | RTAS_LOG_OPTIONAL_PART_PRESENT;
+    if (recovered) {
+        summary |= RTAS_LOG_DISPOSITION_FULLY_RECOVERED;
+    } else {
+        summary |= RTAS_LOG_DISPOSITION_NOT_RECOVERED;
+    }
+
+    if (SRR1_MC_LOADSTORE(env->spr[SPR_SRR1])) {
+        for (i = 0; i < ARRAY_SIZE(mc_derror_table); i++) {
+            if (!(dsisr & mc_derror_table[i].dsisr_value)) {
+                continue;
+            }
+
+            ext_elog->mc.error_type = mc_derror_table[i].error_type;
+            ext_elog->mc.sub_err_type = mc_derror_table[i].error_subtype;
+            if (mc_derror_table[i].dar_valid) {
+                ext_elog->mc.effective_address = cpu_to_be64(env->spr[SPR_DAR]);
+            }
+
+            summary |= mc_derror_table[i].initiator
+                        | mc_derror_table[i].severity;
+
+            return summary;
+        }
+    } else {
+        for (i = 0; i < ARRAY_SIZE(mc_ierror_table); i++) {
+            if ((env->spr[SPR_SRR1] & mc_ierror_table[i].srr1_mask) !=
+                    mc_ierror_table[i].srr1_value) {
+                continue;
+            }
+
+            ext_elog->mc.error_type = mc_ierror_table[i].error_type;
+            ext_elog->mc.sub_err_type = mc_ierror_table[i].error_subtype;
+            if (mc_ierror_table[i].nip_valid) {
+                ext_elog->mc.effective_address = cpu_to_be64(env->nip);
+            }
+
+            summary |= mc_ierror_table[i].initiator
+                        | mc_ierror_table[i].severity;
+
+            return summary;
+        }
+    }
+
+    summary |= RTAS_LOG_INITIATOR_CPU;
+    return summary;
+}
+
+static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    CPUState *cs = CPU(cpu);
+    uint64_t rtas_addr;
+    CPUPPCState *env = &cpu->env;
+    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
+    target_ulong msr = 0;
+    struct rtas_error_log log;
+    struct mc_extended_log *ext_elog;
+    uint32_t summary;
+
+    /*
+     * Properly set bits in MSR before we invoke the handler.
+     * SRR0/1, DAR and DSISR are properly set by KVM
+     */
+    if (!(*pcc->interrupts_big_endian)(cpu)) {
+        msr |= (1ULL << MSR_LE);
+    }
+
+    if (env->msr & (1ULL << MSR_SF)) {
+        msr |= (1ULL << MSR_SF);
+    }
+
+    msr |= (1ULL << MSR_ME);
+
+    ext_elog = g_malloc0(sizeof(*ext_elog));
+    summary = spapr_mce_get_elog_type(cpu, recovered, ext_elog);
+
+    log.summary = cpu_to_be32(summary);
+    log.extended_length = cpu_to_be32(sizeof(*ext_elog));
+
+    spapr_init_v6hdr(&ext_elog->v6hdr);
+    ext_elog->mc.hdr.section_id = cpu_to_be16(RTAS_LOG_V6_SECTION_ID_MC);
+    ext_elog->mc.hdr.section_length =
+                    cpu_to_be16(sizeof(struct rtas_event_log_v6_mc));
+    ext_elog->mc.hdr.section_version = 1;
+
+    /* get rtas addr from fdt */
+    rtas_addr = spapr_get_rtas_addr();
+    if (!rtas_addr) {
+        /* Unable to fetch the RTAS address, so reset the guest */
+        ppc_cpu_do_system_reset(cs);
+        g_free(ext_elog);
+        return;
+    }
+
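+    /*
+     * Error log layout at rtas_addr + RTAS_ERROR_LOG_OFFSET: the saved
+     * GPR3, followed by the fixed error log, then the extended MC log.
+     */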
+    stq_be_phys(&address_space_memory, rtas_addr + RTAS_ERROR_LOG_OFFSET,
+                env->gpr[3]);
+    cpu_physical_memory_write(rtas_addr + RTAS_ERROR_LOG_OFFSET +
+                              sizeof(env->gpr[3]), &log, sizeof(log));
+    cpu_physical_memory_write(rtas_addr + RTAS_ERROR_LOG_OFFSET +
+                              sizeof(env->gpr[3]) + sizeof(log), ext_elog,
+                              sizeof(*ext_elog));
+
+    env->gpr[3] = rtas_addr + RTAS_ERROR_LOG_OFFSET;
+    env->msr = msr;
+    env->nip = spapr->guest_machine_check_addr;
+
+    g_free(ext_elog);
+}
+
+void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    CPUState *cs = CPU(cpu);
+    int ret;
+    Error *local_err = NULL;
+
+    if (spapr->guest_machine_check_addr == -1) {
+        /*
+         * This implies that we have hit a machine check either when the
+         * guest has not registered FWNMI (i.e., "ibm,nmi-register" not
+         * called) or between system reset and "ibm,nmi-register".
+         * Fall back to the old machine check behavior in such cases.
+         */
+        cs->exception_index = POWERPC_EXCP_MCHECK;
+        ppc_cpu_do_interrupt(cs);
+        return;
+    }
+
+    while (spapr->mc_status != -1) {
+        /*
+         * Check whether the same CPU got a machine check error
+         * while it was still handling a previous one (i.e., before
+         * that CPU called "ibm,nmi-interlock")
+         */
+        if (spapr->mc_status == cpu->vcpu_id) {
+            qemu_system_guest_panicked(NULL);
+            return;
+        }
+        qemu_cond_wait_iothread(&spapr->mc_delivery_cond);
+        /* If the system was reset in the meantime, just return */
+        if (spapr->guest_machine_check_addr == -1) {
+            return;
+        }
+    }
+
+    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
+    if (ret == -EBUSY) {
+        /*
+         * We don't want to abort, so we let the migration continue.
+         * In a rare case, the machine check handler will run on the target.
+         * Though this is not preferable, it is better than aborting
+         * the migration or killing the VM.
+         */
+        warn_report("Received a fwnmi while migration was in progress");
+    }
+
+    spapr->mc_status = cpu->vcpu_id;
+    spapr_mce_dispatch_elog(cpu, recovered);
+}
+
 static void check_exception(PowerPCCPU *cpu, SpaprMachineState *spapr,
                             uint32_t token, uint32_t nargs,
                             target_ulong args,
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index f1799b1..b8bb66b 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -1676,6 +1676,18 @@
     Error *local_err = NULL;
     bool raw_mode_supported = false;
     bool guest_xive;
+    CPUState *cs;
+
+    /* CAS is supposed to be called early when only the boot vCPU is active. */
+    CPU_FOREACH(cs) {
+        if (cs == CPU(cpu)) {
+            continue;
+        }
+        if (!cs->halted) {
+            warn_report("guest has multiple active vCPUs at CAS, which is not allowed");
+            return H_MULTI_THREADS_ACTIVE;
+        }
+    }
 
     cas_pvr = cas_check_pvr(spapr, cpu, &addr, &raw_mode_supported, &local_err);
     if (local_err) {
@@ -1703,7 +1715,15 @@
     ov_table = addr;
 
     ov1_guest = spapr_ovec_parse_vector(ov_table, 1);
+    if (!ov1_guest) {
+        warn_report("guest didn't provide option vector 1");
+        return H_PARAMETER;
+    }
     ov5_guest = spapr_ovec_parse_vector(ov_table, 5);
+    if (!ov5_guest) {
+        warn_report("guest didn't provide option vector 5");
+        return H_PARAMETER;
+    }
     if (spapr_ovec_test(ov5_guest, OV5_MMU_BOTH)) {
         error_report("guest requested hash and radix MMU, which is invalid.");
         exit(EXIT_FAILURE);
diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
index 85135e0..883fe28 100644
--- a/hw/ppc/spapr_rtas.c
+++ b/hw/ppc/spapr_rtas.c
@@ -50,6 +50,7 @@
 #include "hw/ppc/fdt.h"
 #include "target/ppc/mmu-hash64.h"
 #include "target/ppc/mmu-book3s-v3.h"
+#include "migration/blocker.h"
 
 static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                    uint32_t token, uint32_t nargs,
@@ -399,6 +400,62 @@
     rtas_st(rets, 1, 100);
 }
 
+static void rtas_ibm_nmi_register(PowerPCCPU *cpu,
+                                  SpaprMachineState *spapr,
+                                  uint32_t token, uint32_t nargs,
+                                  target_ulong args,
+                                  uint32_t nret, target_ulong rets)
+{
+    hwaddr rtas_addr;
+
+    if (spapr_get_cap(spapr, SPAPR_CAP_FWNMI_MCE) == SPAPR_CAP_OFF) {
+        rtas_st(rets, 0, RTAS_OUT_NOT_SUPPORTED);
+        return;
+    }
+
+    rtas_addr = spapr_get_rtas_addr();
+    if (!rtas_addr) {
+        rtas_st(rets, 0, RTAS_OUT_NOT_SUPPORTED);
+        return;
+    }
+
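+    /* Record the guest's machine check handler address (RTAS argument 1) */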
+    spapr->guest_machine_check_addr = rtas_ld(args, 1);
+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+}
+
+static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
+                                   SpaprMachineState *spapr,
+                                   uint32_t token, uint32_t nargs,
+                                   target_ulong args,
+                                   uint32_t nret, target_ulong rets)
+{
+    if (spapr_get_cap(spapr, SPAPR_CAP_FWNMI_MCE) == SPAPR_CAP_OFF) {
+        rtas_st(rets, 0, RTAS_OUT_NOT_SUPPORTED);
+        return;
+    }
+
+    if (spapr->guest_machine_check_addr == -1) {
+        /* NMI register not called */
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    if (spapr->mc_status != cpu->vcpu_id) {
+        /* The vCPU that hit the NMI should invoke "ibm,nmi-interlock" */
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
+    /*
+     * The vCPU issuing "ibm,nmi-interlock" is done with NMI handling,
+     * so clear mc_status.
+     */
+    spapr->mc_status = -1;
+    qemu_cond_signal(&spapr->mc_delivery_cond);
+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+    migrate_del_blocker(spapr->fwnmi_migration_blocker);
+}
+
 static struct rtas_call {
     const char *name;
     spapr_rtas_fn fn;
@@ -476,6 +533,32 @@
     }
 }
 
+hwaddr spapr_get_rtas_addr(void)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    int rtas_node;
+    const fdt32_t *rtas_data;
+    void *fdt = spapr->fdt_blob;
+
+    /* fetch rtas addr from fdt */
+    rtas_node = fdt_path_offset(fdt, "/rtas");
+    if (rtas_node < 0) {
+        return 0;
+    }
+
+    rtas_data = fdt_getprop(fdt, rtas_node, "linux,rtas-base", NULL);
+    if (!rtas_data) {
+        return 0;
+    }
+
+    /*
+     * We assume that the OS called RTAS instantiate-rtas, but some other
+     * OS might call RTAS instantiate-rtas-64 instead. This is fine for now
+     * as SLOF only supports the 32-bit variant.
+     */
+    return (hwaddr)fdt32_to_cpu(*rtas_data);
+}
+
 static void core_rtas_register_types(void)
 {
     spapr_rtas_register(RTAS_DISPLAY_CHARACTER, "display-character",
@@ -501,6 +584,10 @@
                         rtas_set_power_level);
     spapr_rtas_register(RTAS_GET_POWER_LEVEL, "get-power-level",
                         rtas_get_power_level);
+    spapr_rtas_register(RTAS_IBM_NMI_REGISTER, "ibm,nmi-register",
+                        rtas_ibm_nmi_register);
+    spapr_rtas_register(RTAS_IBM_NMI_INTERLOCK, "ibm,nmi-interlock",
+                        rtas_ibm_nmi_interlock);
 }
 
 type_init(core_rtas_register_types)
diff --git a/hw/ppc/spapr_vio.c b/hw/ppc/spapr_vio.c
index f14944e..0b085ea 100644
--- a/hw/ppc/spapr_vio.c
+++ b/hw/ppc/spapr_vio.c
@@ -87,6 +87,7 @@
     SpaprVioDeviceClass *pc = VIO_SPAPR_DEVICE_GET_CLASS(dev);
     int vdevice_off, node_off, ret;
     char *dt_name;
+    const char *dt_compatible;
 
     vdevice_off = fdt_path_offset(fdt, "/vdevice");
     if (vdevice_off < 0) {
@@ -113,9 +114,15 @@
         }
     }
 
-    if (pc->dt_compatible) {
+    if (pc->get_dt_compatible) {
+        dt_compatible = pc->get_dt_compatible(dev);
+    } else {
+        dt_compatible = pc->dt_compatible;
+    }
+
+    if (dt_compatible) {
         ret = fdt_setprop_string(fdt, node_off, "compatible",
-                                 pc->dt_compatible);
+                                 dt_compatible);
         if (ret < 0) {
             return ret;
         }
diff --git a/hw/ppc/virtex_ml507.c b/hw/ppc/virtex_ml507.c
index 7526947..91dd00e 100644
--- a/hw/ppc/virtex_ml507.c
+++ b/hw/ppc/virtex_ml507.c
@@ -89,10 +89,7 @@
     tlb->PID = 0;
 }
 
-static PowerPCCPU *ppc440_init_xilinx(ram_addr_t *ram_size,
-                                      int do_init,
-                                      const char *cpu_type,
-                                      uint32_t sysclk)
+static PowerPCCPU *ppc440_init_xilinx(const char *cpu_type, uint32_t sysclk)
 {
     PowerPCCPU *cpu;
     CPUPPCState *env;
@@ -213,7 +210,7 @@
     int i;
 
     /* init CPUs */
-    cpu = ppc440_init_xilinx(&ram_size, 1, machine->cpu_type, 400000000);
+    cpu = ppc440_init_xilinx(machine->cpu_type, 400000000);
     env = &cpu->env;
 
     if (env->mmu_model != POWERPC_MMU_BOOKE) {
diff --git a/hw/tpm/Kconfig b/hw/tpm/Kconfig
index 4c8ee87..9e67d99 100644
--- a/hw/tpm/Kconfig
+++ b/hw/tpm/Kconfig
@@ -22,3 +22,9 @@
     bool
     default y
     depends on TPMDEV
+
+config TPM_SPAPR
+    bool
+    default y
+    depends on TPM && PSERIES
+    select TPMDEV
diff --git a/hw/tpm/Makefile.objs b/hw/tpm/Makefile.objs
index de0b85d..85eb99a 100644
--- a/hw/tpm/Makefile.objs
+++ b/hw/tpm/Makefile.objs
@@ -4,3 +4,4 @@
 common-obj-$(CONFIG_TPM_CRB) += tpm_crb.o
 common-obj-$(CONFIG_TPM_PASSTHROUGH) += tpm_passthrough.o
 common-obj-$(CONFIG_TPM_EMULATOR) += tpm_emulator.o
+obj-$(CONFIG_TPM_SPAPR) += tpm_spapr.o
diff --git a/hw/tpm/tpm_spapr.c b/hw/tpm/tpm_spapr.c
new file mode 100644
index 0000000..ce65eb2
--- /dev/null
+++ b/hw/tpm/tpm_spapr.c
@@ -0,0 +1,429 @@
+/*
+ * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
+ *
+ * PAPR Virtual TPM
+ *
+ * Copyright (c) 2015, 2017, 2019 IBM Corporation.
+ *
+ * Authors:
+ *    Stefan Berger <stefanb@linux.vnet.ibm.com>
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "hw/qdev-properties.h"
+#include "migration/vmstate.h"
+
+#include "sysemu/tpm_backend.h"
+#include "tpm_int.h"
+#include "tpm_util.h"
+
+#include "hw/ppc/spapr.h"
+#include "hw/ppc/spapr_vio.h"
+#include "trace.h"
+
+#define DEBUG_SPAPR 0
+
+#define VIO_SPAPR_VTPM(obj) \
+     OBJECT_CHECK(SpaprTpmState, (obj), TYPE_TPM_SPAPR)
+
+typedef struct TpmCrq {
+    uint8_t valid;  /* 0x80: cmd; 0xc0: init crq */
+                    /* 0x81-0x83: CRQ message response */
+    uint8_t msg;    /* see below */
+    uint16_t len;   /* len of TPM request; len of TPM response */
+    uint32_t data;  /* rtce_dma_handle when sending TPM request */
+    uint64_t reserved;
+} TpmCrq;
+
+#define SPAPR_VTPM_VALID_INIT_CRQ_COMMAND  0xC0
+#define SPAPR_VTPM_VALID_COMMAND           0x80
+#define SPAPR_VTPM_MSG_RESULT              0x80
+
+/* msg types for valid = SPAPR_VTPM_VALID_INIT_CRQ_COMMAND */
+#define SPAPR_VTPM_INIT_CRQ_RESULT           0x1
+#define SPAPR_VTPM_INIT_CRQ_COMPLETE_RESULT  0x2
+
+/* msg types for valid = SPAPR_VTPM_VALID_COMMAND */
+#define SPAPR_VTPM_GET_VERSION               0x1
+#define SPAPR_VTPM_TPM_COMMAND               0x2
+#define SPAPR_VTPM_GET_RTCE_BUFFER_SIZE      0x3
+#define SPAPR_VTPM_PREPARE_TO_SUSPEND        0x4
+
+/* response error messages */
+#define SPAPR_VTPM_VTPM_ERROR                0xff
+
+/* error codes */
+#define SPAPR_VTPM_ERR_COPY_IN_FAILED        0x3
+#define SPAPR_VTPM_ERR_COPY_OUT_FAILED       0x4
+
+#define TPM_SPAPR_BUFFER_MAX                 4096
+
+typedef struct {
+    SpaprVioDevice vdev;
+
+    TpmCrq crq; /* track single TPM command */
+
+    uint8_t state;
+#define SPAPR_VTPM_STATE_NONE         0
+#define SPAPR_VTPM_STATE_EXECUTION    1
+#define SPAPR_VTPM_STATE_COMPLETION   2
+
+    unsigned char *buffer;
+
+    uint32_t numbytes; /* number of bytes to deliver on resume */
+
+    TPMBackendCmd cmd;
+
+    TPMBackend *be_driver;
+    TPMVersion be_tpm_version;
+
+    size_t be_buffer_size;
+} SpaprTpmState;
+
+/*
+ * Send a request to the TPM.
+ */
+static void tpm_spapr_tpm_send(SpaprTpmState *s)
+{
+    if (trace_event_get_state_backends(TRACE_TPM_SPAPR_SHOW_BUFFER)) {
+        tpm_util_show_buffer(s->buffer, s->be_buffer_size, "To TPM");
+    }
+
+    s->state = SPAPR_VTPM_STATE_EXECUTION;
+    s->cmd = (TPMBackendCmd) {
+        .locty = 0,
+        .in = s->buffer,
+        .in_len = MIN(tpm_cmd_get_size(s->buffer), s->be_buffer_size),
+        .out = s->buffer,
+        .out_len = s->be_buffer_size,
+    };
+
+    tpm_backend_deliver_request(s->be_driver, &s->cmd);
+}
+
+static int tpm_spapr_process_cmd(SpaprTpmState *s, uint64_t dataptr)
+{
+    long rc;
+
+    /* a max. of be_buffer_size bytes can be transported */
+    rc = spapr_vio_dma_read(&s->vdev, dataptr,
+                            s->buffer, s->be_buffer_size);
+    if (rc) {
+        error_report("tpm_spapr_got_payload: DMA read failure");
+    }
+    /* let vTPM handle any malformed request */
+    tpm_spapr_tpm_send(s);
+
+    return rc;
+}
+
+static inline int spapr_tpm_send_crq(struct SpaprVioDevice *dev, TpmCrq *crq)
+{
+    return spapr_vio_send_crq(dev, (uint8_t *)crq);
+}
+
+static int tpm_spapr_do_crq(struct SpaprVioDevice *dev, uint8_t *crq_data)
+{
+    SpaprTpmState *s = VIO_SPAPR_VTPM(dev);
+    TpmCrq local_crq;
+    TpmCrq *crq = &s->crq; /* requests only */
+    int rc;
+    uint8_t valid = crq_data[0];
+    uint8_t msg = crq_data[1];
+
+    trace_tpm_spapr_do_crq(valid, msg);
+
+    switch (valid) {
+    case SPAPR_VTPM_VALID_INIT_CRQ_COMMAND: /* Init command/response */
+
+        /* Respond to initialization request */
+        switch (msg) {
+        case SPAPR_VTPM_INIT_CRQ_RESULT:
+            trace_tpm_spapr_do_crq_crq_result();
+            memset(&local_crq, 0, sizeof(local_crq));
+            local_crq.valid = SPAPR_VTPM_VALID_INIT_CRQ_COMMAND;
+            local_crq.msg = SPAPR_VTPM_INIT_CRQ_RESULT;
+            spapr_tpm_send_crq(dev, &local_crq);
+            break;
+
+        case SPAPR_VTPM_INIT_CRQ_COMPLETE_RESULT:
+            trace_tpm_spapr_do_crq_crq_complete_result();
+            memset(&local_crq, 0, sizeof(local_crq));
+            local_crq.valid = SPAPR_VTPM_VALID_INIT_CRQ_COMMAND;
+            local_crq.msg = SPAPR_VTPM_INIT_CRQ_COMPLETE_RESULT;
+            spapr_tpm_send_crq(dev, &local_crq);
+            break;
+        }
+
+        break;
+    case SPAPR_VTPM_VALID_COMMAND: /* Payloads */
+        switch (msg) {
+        case SPAPR_VTPM_TPM_COMMAND:
+            trace_tpm_spapr_do_crq_tpm_command();
+            if (s->state == SPAPR_VTPM_STATE_EXECUTION) {
+                return H_BUSY;
+            }
+            memcpy(crq, crq_data, sizeof(*crq));
+
+            rc = tpm_spapr_process_cmd(s, be32_to_cpu(crq->data));
+
+            if (rc == H_SUCCESS) {
+                crq->valid = be16_to_cpu(0);
+            } else {
+                local_crq.valid = SPAPR_VTPM_MSG_RESULT;
+                local_crq.msg = SPAPR_VTPM_VTPM_ERROR;
+                local_crq.len = cpu_to_be16(0);
+                local_crq.data = cpu_to_be32(SPAPR_VTPM_ERR_COPY_IN_FAILED);
+                spapr_tpm_send_crq(dev, &local_crq);
+            }
+            break;
+
+        case SPAPR_VTPM_GET_RTCE_BUFFER_SIZE:
+            trace_tpm_spapr_do_crq_tpm_get_rtce_buffer_size(s->be_buffer_size);
+            local_crq.valid = SPAPR_VTPM_VALID_COMMAND;
+            local_crq.msg = SPAPR_VTPM_GET_RTCE_BUFFER_SIZE |
+                            SPAPR_VTPM_MSG_RESULT;
+            local_crq.len = cpu_to_be16(s->be_buffer_size);
+            spapr_tpm_send_crq(dev, &local_crq);
+            break;
+
+        case SPAPR_VTPM_GET_VERSION:
+            local_crq.valid = SPAPR_VTPM_VALID_COMMAND;
+            local_crq.msg = SPAPR_VTPM_GET_VERSION | SPAPR_VTPM_MSG_RESULT;
+            local_crq.len = cpu_to_be16(0);
+            switch (s->be_tpm_version) {
+            case TPM_VERSION_1_2:
+                local_crq.data = cpu_to_be32(1);
+                break;
+            case TPM_VERSION_2_0:
+                local_crq.data = cpu_to_be32(2);
+                break;
+            default:
+                g_assert_not_reached();
+                break;
+            }
+            trace_tpm_spapr_do_crq_get_version(be32_to_cpu(local_crq.data));
+            spapr_tpm_send_crq(dev, &local_crq);
+            break;
+
+        case SPAPR_VTPM_PREPARE_TO_SUSPEND:
+            trace_tpm_spapr_do_crq_prepare_to_suspend();
+            local_crq.valid = SPAPR_VTPM_VALID_COMMAND;
+            local_crq.msg = SPAPR_VTPM_PREPARE_TO_SUSPEND |
+                            SPAPR_VTPM_MSG_RESULT;
+            spapr_tpm_send_crq(dev, &local_crq);
+            break;
+
+        default:
+            trace_tpm_spapr_do_crq_unknown_msg_type(crq->msg);
+        }
+        break;
+    default:
+        trace_tpm_spapr_do_crq_unknown_crq(valid, msg);
+    };
+
+    return H_SUCCESS;
+}
+
+static void tpm_spapr_request_completed(TPMIf *ti, int ret)
+{
+    SpaprTpmState *s = VIO_SPAPR_VTPM(ti);
+    TpmCrq *crq = &s->crq;
+    uint32_t len;
+    int rc;
+
+    s->state = SPAPR_VTPM_STATE_COMPLETION;
+
+    /* a max. of be_buffer_size bytes can be transported */
+    len = MIN(tpm_cmd_get_size(s->buffer), s->be_buffer_size);
+
+    if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
+        trace_tpm_spapr_caught_response(len);
+        /* defer delivery of response until .post_load */
+        s->numbytes = len;
+        return;
+    }
+
+    rc = spapr_vio_dma_write(&s->vdev, be32_to_cpu(crq->data),
+                             s->buffer, len);
+
+    if (trace_event_get_state_backends(TRACE_TPM_SPAPR_SHOW_BUFFER)) {
+        tpm_util_show_buffer(s->buffer, len, "From TPM");
+    }
+
+    crq->valid = SPAPR_VTPM_MSG_RESULT;
+    if (rc == H_SUCCESS) {
+        crq->msg = SPAPR_VTPM_TPM_COMMAND | SPAPR_VTPM_MSG_RESULT;
+        crq->len = cpu_to_be16(len);
+    } else {
+        error_report("%s: DMA write failure", __func__);
+        crq->msg = SPAPR_VTPM_VTPM_ERROR;
+        crq->len = cpu_to_be16(0);
+        crq->data = cpu_to_be32(SPAPR_VTPM_ERR_COPY_OUT_FAILED);
+    }
+
+    rc = spapr_tpm_send_crq(&s->vdev, crq);
+    if (rc) {
+        error_report("%s: Error sending response", __func__);
+    }
+}
+
+static int tpm_spapr_do_startup_tpm(SpaprTpmState *s, size_t buffersize)
+{
+    return tpm_backend_startup_tpm(s->be_driver, buffersize);
+}
+
+static const char *tpm_spapr_get_dt_compatible(SpaprVioDevice *dev)
+{
+    SpaprTpmState *s = VIO_SPAPR_VTPM(dev);
+
+    switch (s->be_tpm_version) {
+    case TPM_VERSION_1_2:
+        return "IBM,vtpm";
+    case TPM_VERSION_2_0:
+        return "IBM,vtpm20";
+    default:
+        g_assert_not_reached();
+    }
+}
+
+static void tpm_spapr_reset(SpaprVioDevice *dev)
+{
+    SpaprTpmState *s = VIO_SPAPR_VTPM(dev);
+
+    s->state = SPAPR_VTPM_STATE_NONE;
+    s->numbytes = 0;
+
+    s->be_tpm_version = tpm_backend_get_tpm_version(s->be_driver);
+
+    s->be_buffer_size = MIN(tpm_backend_get_buffer_size(s->be_driver),
+                            TPM_SPAPR_BUFFER_MAX);
+
+    tpm_backend_reset(s->be_driver);
+    tpm_spapr_do_startup_tpm(s, s->be_buffer_size);
+}
+
+static enum TPMVersion tpm_spapr_get_version(TPMIf *ti)
+{
+    SpaprTpmState *s = VIO_SPAPR_VTPM(ti);
+
+    if (tpm_backend_had_startup_error(s->be_driver)) {
+        return TPM_VERSION_UNSPEC;
+    }
+
+    return tpm_backend_get_tpm_version(s->be_driver);
+}
+
+/* persistent state handling */
+
+static int tpm_spapr_pre_save(void *opaque)
+{
+    SpaprTpmState *s = opaque;
+
+    tpm_backend_finish_sync(s->be_driver);
+    /*
+     * We cannot deliver the response to the VM here since the DMA write
+     * would touch VM memory; delivery is deferred to .post_load.
+     */
+
+    return 0;
+}
+
+static int tpm_spapr_post_load(void *opaque, int version_id)
+{
+    SpaprTpmState *s = opaque;
+
+    if (s->numbytes) {
+        trace_tpm_spapr_post_load();
+        /* deliver the results to the VM via DMA */
+        tpm_spapr_request_completed(TPM_IF(s), 0);
+        s->numbytes = 0;
+    }
+
+    return 0;
+}
+
+static const VMStateDescription vmstate_spapr_vtpm = {
+    .name = "tpm-spapr",
+    .pre_save = tpm_spapr_pre_save,
+    .post_load = tpm_spapr_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_SPAPR_VIO(vdev, SpaprTpmState),
+
+        VMSTATE_UINT8(state, SpaprTpmState),
+        VMSTATE_UINT32(numbytes, SpaprTpmState),
+        VMSTATE_VBUFFER_UINT32(buffer, SpaprTpmState, 0, NULL, numbytes),
+        /* remember DMA address */
+        VMSTATE_UINT32(crq.data, SpaprTpmState),
+        VMSTATE_END_OF_LIST(),
+    }
+};
+
+static Property tpm_spapr_properties[] = {
+    DEFINE_SPAPR_PROPERTIES(SpaprTpmState, vdev),
+    DEFINE_PROP_TPMBE("tpmdev", SpaprTpmState, be_driver),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void tpm_spapr_realizefn(SpaprVioDevice *dev, Error **errp)
+{
+    SpaprTpmState *s = VIO_SPAPR_VTPM(dev);
+
+    if (!tpm_find()) {
+        error_setg(errp, "at most one TPM device is permitted");
+        return;
+    }
+
+    dev->crq.SendFunc = tpm_spapr_do_crq;
+
+    if (!s->be_driver) {
+        error_setg(errp, "'tpmdev' property is required");
+        return;
+    }
+    s->buffer = g_malloc(TPM_SPAPR_BUFFER_MAX);
+}
+
+static void tpm_spapr_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    SpaprVioDeviceClass *k = VIO_SPAPR_DEVICE_CLASS(klass);
+    TPMIfClass *tc = TPM_IF_CLASS(klass);
+
+    k->realize = tpm_spapr_realizefn;
+    k->reset = tpm_spapr_reset;
+    k->dt_name = "vtpm";
+    k->dt_type = "IBM,vtpm";
+    k->get_dt_compatible = tpm_spapr_get_dt_compatible;
+    k->signal_mask = 0x00000001;
+    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+    device_class_set_props(dc, tpm_spapr_properties);
+    k->rtce_window_size = 0x10000000;
+    dc->vmsd = &vmstate_spapr_vtpm;
+
+    tc->model = TPM_MODEL_TPM_SPAPR;
+    tc->get_version = tpm_spapr_get_version;
+    tc->request_completed = tpm_spapr_request_completed;
+}
+
+static const TypeInfo tpm_spapr_info = {
+    .name          = TYPE_TPM_SPAPR,
+    .parent        = TYPE_VIO_SPAPR_DEVICE,
+    .instance_size = sizeof(SpaprTpmState),
+    .class_init    = tpm_spapr_class_init,
+    .interfaces = (InterfaceInfo[]) {
+        { TYPE_TPM_IF },
+        { }
+    }
+};
+
+static void tpm_spapr_register_types(void)
+{
+    type_register_static(&tpm_spapr_info);
+}
+
+type_init(tpm_spapr_register_types)
diff --git a/hw/tpm/tpm_tis.c b/hw/tpm/tpm_tis.c
index 5362df2..31facb8 100644
--- a/hw/tpm/tpm_tis.c
+++ b/hw/tpm/tpm_tis.c
@@ -107,30 +107,6 @@
     return (uint8_t)((addr >> TPM_TIS_LOCALITY_SHIFT) & 0x7);
 }
 
-static void tpm_tis_show_buffer(const unsigned char *buffer,
-                                size_t buffer_size, const char *string)
-{
-    size_t len, i;
-    char *line_buffer, *p;
-
-    len = MIN(tpm_cmd_get_size(buffer), buffer_size);
-
-    /*
-     * allocate enough room for 3 chars per buffer entry plus a
-     * newline after every 16 chars and a final null terminator.
-     */
-    line_buffer = g_malloc(len * 3 + (len / 16) + 1);
-
-    for (i = 0, p = line_buffer; i < len; i++) {
-        if (i && !(i % 16)) {
-            p += sprintf(p, "\n");
-        }
-        p += sprintf(p, "%.2X ", buffer[i]);
-    }
-    trace_tpm_tis_show_buffer(string, len, line_buffer);
-
-    g_free(line_buffer);
-}
 
 /*
  * Set the given flags in the STS register by clearing the register but
@@ -156,8 +132,8 @@
  */
 static void tpm_tis_tpm_send(TPMState *s, uint8_t locty)
 {
-    if (trace_event_get_state_backends(TRACE_TPM_TIS_SHOW_BUFFER)) {
-        tpm_tis_show_buffer(s->buffer, s->be_buffer_size, "To TPM");
+    if (trace_event_get_state_backends(TRACE_TPM_UTIL_SHOW_BUFFER)) {
+        tpm_util_show_buffer(s->buffer, s->be_buffer_size, "To TPM");
     }
 
     /*
@@ -325,8 +301,8 @@
     s->loc[locty].state = TPM_TIS_STATE_COMPLETION;
     s->rw_offset = 0;
 
-    if (trace_event_get_state_backends(TRACE_TPM_TIS_SHOW_BUFFER)) {
-        tpm_tis_show_buffer(s->buffer, s->be_buffer_size, "From TPM");
+    if (trace_event_get_state_backends(TRACE_TPM_UTIL_SHOW_BUFFER)) {
+        tpm_util_show_buffer(s->buffer, s->be_buffer_size, "From TPM");
     }
 
     if (TPM_TIS_IS_VALID_LOCTY(s->next_locty)) {
diff --git a/hw/tpm/tpm_util.c b/hw/tpm/tpm_util.c
index 62b091f..c0a0f3d 100644
--- a/hw/tpm/tpm_util.c
+++ b/hw/tpm/tpm_util.c
@@ -350,3 +350,28 @@
     tsb->buffer = NULL;
     tsb->size = 0;
 }
+
+void tpm_util_show_buffer(const unsigned char *buffer,
+                          size_t buffer_size, const char *string)
+{
+    size_t len, i;
+    char *line_buffer, *p;
+
+    len = MIN(tpm_cmd_get_size(buffer), buffer_size);
+
+    /*
+     * allocate enough room for 3 chars per buffer entry plus a
+     * newline after every 16 chars and a final null terminator.
+     */
+    line_buffer = g_malloc(len * 3 + (len / 16) + 1);
+
+    for (i = 0, p = line_buffer; i < len; i++) {
+        if (i && !(i % 16)) {
+            p += sprintf(p, "\n");
+        }
+        p += sprintf(p, "%.2X ", buffer[i]);
+    }
+    trace_tpm_util_show_buffer(string, len, line_buffer);
+
+    g_free(line_buffer);
+}
diff --git a/hw/tpm/tpm_util.h b/hw/tpm/tpm_util.h
index f397ac2..7889081 100644
--- a/hw/tpm/tpm_util.h
+++ b/hw/tpm/tpm_util.h
@@ -79,4 +79,7 @@
 
 void tpm_sized_buffer_reset(TPMSizedBuffer *tsb);
 
+void tpm_util_show_buffer(const unsigned char *buffer,
+                          size_t buffer_size, const char *string);
+
 #endif /* TPM_TPM_UTIL_H */
diff --git a/hw/tpm/trace-events b/hw/tpm/trace-events
index 89804bc..439e514 100644
--- a/hw/tpm/trace-events
+++ b/hw/tpm/trace-events
@@ -14,6 +14,7 @@
 tpm_util_get_buffer_size_hdr_len2(uint32_t len, size_t expected) "tpm2_resp->hdr.len = %u, expected = %zu"
 tpm_util_get_buffer_size_len2(uint32_t len, size_t expected) "tpm2_resp->len = %u, expected = %zu"
 tpm_util_get_buffer_size(size_t len) "buffersize of device: %zu"
+tpm_util_show_buffer(const char *direction, size_t len, const char *buf) "direction: %s len: %zu\n%s"
 
 # tpm_emulator.c
 tpm_emulator_set_locality(uint8_t locty) "setting locality to %d"
@@ -36,7 +37,6 @@
 tpm_emulator_inst_init(void) ""
 
 # tpm_tis.c
-tpm_tis_show_buffer(const char *direction, size_t len, const char *buf) "direction: %s len: %zu\nbuf: %s"
 tpm_tis_raise_irq(uint32_t irqmask) "Raising IRQ for flag 0x%08x"
 tpm_tis_new_active_locality(uint8_t locty) "Active locality is now %d"
 tpm_tis_abort(uint8_t locty) "New active locality is %d"
@@ -55,3 +55,17 @@
 
 # tpm_ppi.c
 tpm_ppi_memset(uint8_t *ptr, size_t size) "memset: %p %zu"
+
+# hw/tpm/tpm_spapr.c
+tpm_spapr_show_buffer(const char *direction, size_t len, const char *buf) "direction: %s len: %zu\n%s"
+tpm_spapr_do_crq(uint8_t raw1, uint8_t raw2) "1st 2 bytes in CRQ: 0x%02x 0x%02x"
+tpm_spapr_do_crq_crq_result(void) "SPAPR_VTPM_INIT_CRQ_RESULT"
+tpm_spapr_do_crq_crq_complete_result(void) "SPAPR_VTPM_INIT_CRQ_COMP_RESULT"
+tpm_spapr_do_crq_tpm_command(void) "got TPM command payload"
+tpm_spapr_do_crq_tpm_get_rtce_buffer_size(size_t buffersize) "response: buffer size is %zu"
+tpm_spapr_do_crq_get_version(uint32_t version) "response: version %u"
+tpm_spapr_do_crq_prepare_to_suspend(void) "response: preparing to suspend"
+tpm_spapr_do_crq_unknown_msg_type(uint8_t type) "Unknown message type 0x%02x"
+tpm_spapr_do_crq_unknown_crq(uint8_t raw1, uint8_t raw2) "unknown CRQ 0x%02x 0x%02x ..."
+tpm_spapr_post_load(void) "Delivering TPM response after resume"
+tpm_spapr_caught_response(uint32_t v) "Caught response to deliver after resume: %u bytes"
diff --git a/include/hw/pci-host/pnv_phb3.h b/include/hw/pci-host/pnv_phb3.h
new file mode 100644
index 0000000..75b7878
--- /dev/null
+++ b/include/hw/pci-host/pnv_phb3.h
@@ -0,0 +1,164 @@
+/*
+ * QEMU PowerPC PowerNV (POWER8) PHB3 model
+ *
+ * Copyright (c) 2014-2020, IBM Corporation.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ */
+
+#ifndef PCI_HOST_PNV_PHB3_H
+#define PCI_HOST_PNV_PHB3_H
+
+#include "hw/pci/pcie_host.h"
+#include "hw/pci/pcie_port.h"
+#include "hw/ppc/xics.h"
+
+typedef struct PnvPHB3 PnvPHB3;
+
+/*
+ * PHB3 XICS Source for MSIs
+ */
+#define TYPE_PHB3_MSI "phb3-msi"
+#define PHB3_MSI(obj) OBJECT_CHECK(Phb3MsiState, (obj), TYPE_PHB3_MSI)
+
+#define PHB3_MAX_MSI     2048
+
+typedef struct Phb3MsiState {
+    ICSState ics;
+    qemu_irq *qirqs;
+
+    PnvPHB3 *phb;
+    uint64_t rba[PHB3_MAX_MSI / 64];
+    uint32_t rba_sum;
+} Phb3MsiState;
+
+void pnv_phb3_msi_update_config(Phb3MsiState *msis, uint32_t base,
+                                uint32_t count);
+void pnv_phb3_msi_send(Phb3MsiState *msis, uint64_t addr, uint16_t data,
+                       int32_t dev_pe);
+void pnv_phb3_msi_ffi(Phb3MsiState *msis, uint64_t val);
+void pnv_phb3_msi_pic_print_info(Phb3MsiState *msis, Monitor *mon);
+
+
+/*
+ * We have one such address space wrapper per possible device under
+ * the PHB since they need to be assigned statically at qemu device
+ * creation time. The relationship to a PE is done later dynamically.
+ * This means we can potentially create a lot of these guys. Q35
+ * stores them as some kind of radix tree but we never really need to
+ * do fast lookups, so instead we simply keep a QLIST of them for now;
+ * we can add the radix tree if needed later on.
+ *
+ * We do cache the PE number to speed things up a bit though.
+ */
+typedef struct PnvPhb3DMASpace {
+    PCIBus *bus;
+    uint8_t devfn;
+    int pe_num;         /* Cached PE number */
+#define PHB_INVALID_PE (-1)
+    PnvPHB3 *phb;
+    AddressSpace dma_as;
+    IOMMUMemoryRegion dma_mr;
+    MemoryRegion msi32_mr;
+    MemoryRegion msi64_mr;
+    QLIST_ENTRY(PnvPhb3DMASpace) list;
+} PnvPhb3DMASpace;
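
Editorial note on the comment above: a minimal sketch of how the lookup over
this QLIST might look on the C side of the model, assuming a hypothetical
helper name (pnv_phb3_dma_space_get) and the standard g_new0() and QLIST
macros; the model's actual lookup path may differ.

    static PnvPhb3DMASpace *pnv_phb3_dma_space_get(PnvPHB3 *phb, PCIBus *bus,
                                                   uint8_t devfn)
    {
        PnvPhb3DMASpace *ds;

        /* Linear scan: lookups are rare, so a QLIST is good enough (see above) */
        QLIST_FOREACH(ds, &phb->dma_spaces, list) {
            if (ds->bus == bus && ds->devfn == devfn) {
                return ds;
            }
        }

        /* Not found: create a wrapper; the PE number is resolved (and cached) later */
        ds = g_new0(PnvPhb3DMASpace, 1);
        ds->bus = bus;
        ds->devfn = devfn;
        ds->pe_num = PHB_INVALID_PE;
        ds->phb = phb;
        QLIST_INSERT_HEAD(&phb->dma_spaces, ds, list);
        return ds;
    }
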
+
+/*
+ * PHB3 Power Bus Common Queue
+ */
+#define TYPE_PNV_PBCQ "pnv-pbcq"
+#define PNV_PBCQ(obj) OBJECT_CHECK(PnvPBCQState, (obj), TYPE_PNV_PBCQ)
+
+typedef struct PnvPBCQState {
+    DeviceState parent;
+
+    uint32_t nest_xbase;
+    uint32_t spci_xbase;
+    uint32_t pci_xbase;
+#define PBCQ_NEST_REGS_COUNT    0x46
+#define PBCQ_PCI_REGS_COUNT     0x15
+#define PBCQ_SPCI_REGS_COUNT    0x5
+
+    uint64_t nest_regs[PBCQ_NEST_REGS_COUNT];
+    uint64_t spci_regs[PBCQ_SPCI_REGS_COUNT];
+    uint64_t pci_regs[PBCQ_PCI_REGS_COUNT];
+    MemoryRegion mmbar0;
+    MemoryRegion mmbar1;
+    MemoryRegion phbbar;
+    uint64_t mmio0_base;
+    uint64_t mmio0_size;
+    uint64_t mmio1_base;
+    uint64_t mmio1_size;
+    PnvPHB3 *phb;
+
+    MemoryRegion xscom_nest_regs;
+    MemoryRegion xscom_pci_regs;
+    MemoryRegion xscom_spci_regs;
+} PnvPBCQState;
+
+/*
+ * PHB3 PCIe Root port
+ */
+#define TYPE_PNV_PHB3_ROOT_BUS "pnv-phb3-root-bus"
+
+#define TYPE_PNV_PHB3_ROOT_PORT "pnv-phb3-root-port"
+
+typedef struct PnvPHB3RootPort {
+    PCIESlot parent_obj;
+} PnvPHB3RootPort;
+
+/*
+ * PHB3 PCIe Host Bridge for PowerNV machines (POWER8)
+ */
+#define TYPE_PNV_PHB3 "pnv-phb3"
+#define PNV_PHB3(obj) OBJECT_CHECK(PnvPHB3, (obj), TYPE_PNV_PHB3)
+
+#define PNV_PHB3_NUM_M64      16
+#define PNV_PHB3_NUM_REGS     (0x1000 >> 3)
+#define PNV_PHB3_NUM_LSI      8
+#define PNV_PHB3_NUM_PE       256
+
+#define PCI_MMIO_TOTAL_SIZE   (0x1ull << 60)
+
+struct PnvPHB3 {
+    PCIExpressHost parent_obj;
+
+    uint32_t chip_id;
+    uint32_t phb_id;
+    char bus_path[8];
+
+    uint64_t regs[PNV_PHB3_NUM_REGS];
+    MemoryRegion mr_regs;
+
+    MemoryRegion mr_m32;
+    MemoryRegion mr_m64[PNV_PHB3_NUM_M64];
+    MemoryRegion pci_mmio;
+    MemoryRegion pci_io;
+
+    uint64_t ioda_LIST[8];
+    uint64_t ioda_LXIVT[8];
+    uint64_t ioda_TVT[512];
+    uint64_t ioda_M64BT[16];
+    uint64_t ioda_MDT[256];
+    uint64_t ioda_PEEV[4];
+
+    uint32_t total_irq;
+    ICSState lsis;
+    qemu_irq *qirqs;
+    Phb3MsiState msis;
+
+    PnvPBCQState pbcq;
+
+    PnvPHB3RootPort root;
+
+    QLIST_HEAD(, PnvPhb3DMASpace) dma_spaces;
+};
+
+uint64_t pnv_phb3_reg_read(void *opaque, hwaddr off, unsigned size);
+void pnv_phb3_reg_write(void *opaque, hwaddr off, uint64_t val, unsigned size);
+void pnv_phb3_update_regions(PnvPHB3 *phb);
+void pnv_phb3_remap_irqs(PnvPHB3 *phb);
+
+#endif /* PCI_HOST_PNV_PHB3_H */
diff --git a/include/hw/pci-host/pnv_phb3_regs.h b/include/hw/pci-host/pnv_phb3_regs.h
new file mode 100644
index 0000000..a174ef1
--- /dev/null
+++ b/include/hw/pci-host/pnv_phb3_regs.h
@@ -0,0 +1,450 @@
+/*
+ * QEMU PowerPC PowerNV (POWER8) PHB3 model
+ *
+ * Copyright (c) 2013-2020, IBM Corporation.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ */
+
+#ifndef PCI_HOST_PNV_PHB3_REGS_H
+#define PCI_HOST_PNV_PHB3_REGS_H
+
+#include "qemu/host-utils.h"
+
+/*
+ * QEMU version of the GETFIELD/SETFIELD macros
+ *
+ * These are common with the PnvXive model.
+ */
+static inline uint64_t GETFIELD(uint64_t mask, uint64_t word)
+{
+    return (word & mask) >> ctz64(mask);
+}
+
+static inline uint64_t SETFIELD(uint64_t mask, uint64_t word,
+                                uint64_t value)
+{
+    return (word & ~mask) | ((value << ctz64(mask)) & mask);
+}
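
As a small worked example of the helpers above (an editorial sketch, not part
of the patch): EXAMPLE_FIELD and example_bump_field() are made-up names, and
PPC_BITMASK is the usual ppc bit-numbering macro used throughout this header.

    /* Made-up field covering PPC bits 4..11 of a 64-bit register */
    #define EXAMPLE_FIELD   PPC_BITMASK(4, 11)

    static inline uint64_t example_bump_field(uint64_t word)
    {
        /* Extract the field, increment it, and write it back into the word */
        uint64_t v = GETFIELD(EXAMPLE_FIELD, word);

        return SETFIELD(EXAMPLE_FIELD, word, v + 1);
    }
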
+
+/*
+ * PBCQ XSCOM registers
+ */
+
+#define PBCQ_NEST_IRSN_COMPARE  0x1a
+#define PBCQ_NEST_IRSN_COMP           PPC_BITMASK(0, 18)
+#define PBCQ_NEST_IRSN_MASK     0x1b
+#define PBCQ_NEST_LSI_SRC_ID    0x1f
+#define   PBCQ_NEST_LSI_SRC           PPC_BITMASK(0, 7)
+#define PBCQ_NEST_REGS_COUNT    0x46
+#define PBCQ_NEST_MMIO_BAR0     0x40
+#define PBCQ_NEST_MMIO_BAR1     0x41
+#define PBCQ_NEST_PHB_BAR       0x42
+#define PBCQ_NEST_MMIO_MASK0    0x43
+#define PBCQ_NEST_MMIO_MASK1    0x44
+#define PBCQ_NEST_BAR_EN        0x45
+#define   PBCQ_NEST_BAR_EN_MMIO0    PPC_BIT(0)
+#define   PBCQ_NEST_BAR_EN_MMIO1    PPC_BIT(1)
+#define   PBCQ_NEST_BAR_EN_PHB      PPC_BIT(2)
+#define   PBCQ_NEST_BAR_EN_IRSN_RX  PPC_BIT(3)
+#define   PBCQ_NEST_BAR_EN_IRSN_TX  PPC_BIT(4)
+
+#define PBCQ_PCI_REGS_COUNT     0x15
+#define PBCQ_PCI_BAR2           0x0b
+
+#define PBCQ_SPCI_REGS_COUNT    0x5
+#define PBCQ_SPCI_ASB_ADDR      0x0
+#define PBCQ_SPCI_ASB_STATUS    0x1
+#define PBCQ_SPCI_ASB_DATA      0x2
+#define PBCQ_SPCI_AIB_CAPP_EN   0x3
+#define PBCQ_SPCI_CAPP_SEC_TMR  0x4
+
+/*
+ * PHB MMIO registers
+ */
+
+/* PHB Fundamental register set A */
+#define PHB_LSI_SOURCE_ID               0x100
+#define   PHB_LSI_SRC_ID                PPC_BITMASK(5, 12)
+#define PHB_DMA_CHAN_STATUS             0x110
+#define   PHB_DMA_CHAN_ANY_ERR          PPC_BIT(27)
+#define   PHB_DMA_CHAN_ANY_ERR1         PPC_BIT(28)
+#define   PHB_DMA_CHAN_ANY_FREEZE       PPC_BIT(29)
+#define PHB_CPU_LOADSTORE_STATUS        0x120
+#define   PHB_CPU_LS_ANY_ERR            PPC_BIT(27)
+#define   PHB_CPU_LS_ANY_ERR1           PPC_BIT(28)
+#define   PHB_CPU_LS_ANY_FREEZE         PPC_BIT(29)
+#define PHB_DMA_MSI_NODE_ID             0x128
+#define   PHB_DMAMSI_NID_FIXED          PPC_BIT(0)
+#define   PHB_DMAMSI_NID                PPC_BITMASK(24, 31)
+#define PHB_CONFIG_DATA                 0x130
+#define PHB_LOCK0                       0x138
+#define PHB_CONFIG_ADDRESS              0x140
+#define   PHB_CA_ENABLE                 PPC_BIT(0)
+#define   PHB_CA_BUS                    PPC_BITMASK(4, 11)
+#define   PHB_CA_DEV                    PPC_BITMASK(12, 16)
+#define   PHB_CA_FUNC                   PPC_BITMASK(17, 19)
+#define   PHB_CA_REG                    PPC_BITMASK(20, 31)
+#define   PHB_CA_PE                     PPC_BITMASK(40, 47)
+#define PHB_LOCK1                       0x148
+#define PHB_IVT_BAR                     0x150
+#define   PHB_IVT_BAR_ENABLE            PPC_BIT(0)
+#define   PHB_IVT_BASE_ADDRESS_MASK     PPC_BITMASK(14, 48)
+#define   PHB_IVT_LENGTH_MASK           PPC_BITMASK(52, 63)
+#define PHB_RBA_BAR                     0x158
+#define   PHB_RBA_BAR_ENABLE            PPC_BIT(0)
+#define   PHB_RBA_BASE_ADDRESS          PPC_BITMASK(14, 55)
+#define PHB_PHB3_CONFIG                 0x160
+#define   PHB_PHB3C_64B_TCE_EN          PPC_BIT(2)
+#define   PHB_PHB3C_32BIT_MSI_EN        PPC_BIT(8)
+#define   PHB_PHB3C_64BIT_MSI_EN        PPC_BIT(14)
+#define   PHB_PHB3C_M32_EN              PPC_BIT(16)
+#define PHB_RTT_BAR                     0x168
+#define   PHB_RTT_BAR_ENABLE            PPC_BIT(0)
+#define   PHB_RTT_BASE_ADDRESS_MASK     PPC_BITMASK(14, 46)
+#define PHB_PELTV_BAR                   0x188
+#define   PHB_PELTV_BAR_ENABLE          PPC_BIT(0)
+#define   PHB_PELTV_BASE_ADDRESS        PPC_BITMASK(14, 50)
+#define PHB_M32_BASE_ADDR               0x190
+#define PHB_M32_BASE_MASK               0x198
+#define PHB_M32_START_ADDR              0x1a0
+#define PHB_PEST_BAR                    0x1a8
+#define   PHB_PEST_BAR_ENABLE           PPC_BIT(0)
+#define   PHB_PEST_BASE_ADDRESS         PPC_BITMASK(14, 51)
+#define PHB_M64_UPPER_BITS              0x1f0
+#define PHB_INTREP_TIMER                0x1f8
+#define PHB_DMARD_SYNC                  0x200
+#define   PHB_DMARD_SYNC_START          PPC_BIT(0)
+#define   PHB_DMARD_SYNC_COMPLETE       PPC_BIT(1)
+#define PHB_RTC_INVALIDATE              0x208
+#define   PHB_RTC_INVALIDATE_ALL        PPC_BIT(0)
+#define   PHB_RTC_INVALIDATE_RID        PPC_BITMASK(16, 31)
+#define PHB_TCE_KILL                    0x210
+#define   PHB_TCE_KILL_ALL              PPC_BIT(0)
+#define PHB_TCE_SPEC_CTL                0x218
+#define PHB_IODA_ADDR                   0x220
+#define   PHB_IODA_AD_AUTOINC           PPC_BIT(0)
+#define   PHB_IODA_AD_TSEL              PPC_BITMASK(11, 15)
+#define   PHB_IODA_AD_TADR              PPC_BITMASK(55, 63)
+#define PHB_IODA_DATA0                  0x228
+#define PHB_FFI_REQUEST                 0x238
+#define   PHB_FFI_LOCK_CLEAR            PPC_BIT(3)
+#define   PHB_FFI_REQUEST_ISN           PPC_BITMASK(49, 59)
+#define PHB_FFI_LOCK                    0x240
+#define   PHB_FFI_LOCK_STATE            PPC_BIT(0)
+#define PHB_XIVE_UPDATE                 0x248 /* Broken in DD1 */
+#define PHB_PHB3_GEN_CAP                0x250
+#define PHB_PHB3_TCE_CAP                0x258
+#define PHB_PHB3_IRQ_CAP                0x260
+#define PHB_PHB3_EEH_CAP                0x268
+#define PHB_IVC_INVALIDATE              0x2a0
+#define   PHB_IVC_INVALIDATE_ALL        PPC_BIT(0)
+#define   PHB_IVC_INVALIDATE_SID        PPC_BITMASK(16, 31)
+#define PHB_IVC_UPDATE                  0x2a8
+#define   PHB_IVC_UPDATE_ENABLE_P       PPC_BIT(0)
+#define   PHB_IVC_UPDATE_ENABLE_Q       PPC_BIT(1)
+#define   PHB_IVC_UPDATE_ENABLE_SERVER  PPC_BIT(2)
+#define   PHB_IVC_UPDATE_ENABLE_PRI     PPC_BIT(3)
+#define   PHB_IVC_UPDATE_ENABLE_GEN     PPC_BIT(4)
+#define   PHB_IVC_UPDATE_ENABLE_CON     PPC_BIT(5)
+#define   PHB_IVC_UPDATE_GEN_MATCH      PPC_BITMASK(6, 7)
+#define   PHB_IVC_UPDATE_SERVER         PPC_BITMASK(8, 23)
+#define   PHB_IVC_UPDATE_PRI            PPC_BITMASK(24, 31)
+#define   PHB_IVC_UPDATE_GEN            PPC_BITMASK(32, 33)
+#define   PHB_IVC_UPDATE_P              PPC_BITMASK(34, 34)
+#define   PHB_IVC_UPDATE_Q              PPC_BITMASK(35, 35)
+#define   PHB_IVC_UPDATE_SID            PPC_BITMASK(48, 63)
+#define PHB_PAPR_ERR_INJ_CTL            0x2b0
+#define   PHB_PAPR_ERR_INJ_CTL_INB      PPC_BIT(0)
+#define   PHB_PAPR_ERR_INJ_CTL_OUTB     PPC_BIT(1)
+#define   PHB_PAPR_ERR_INJ_CTL_STICKY   PPC_BIT(2)
+#define   PHB_PAPR_ERR_INJ_CTL_CFG      PPC_BIT(3)
+#define   PHB_PAPR_ERR_INJ_CTL_RD       PPC_BIT(4)
+#define   PHB_PAPR_ERR_INJ_CTL_WR       PPC_BIT(5)
+#define   PHB_PAPR_ERR_INJ_CTL_FREEZE   PPC_BIT(6)
+#define PHB_PAPR_ERR_INJ_ADDR           0x2b8
+#define   PHB_PAPR_ERR_INJ_ADDR_MMIO            PPC_BITMASK(16, 63)
+#define PHB_PAPR_ERR_INJ_MASK           0x2c0
+#define   PHB_PAPR_ERR_INJ_MASK_CFG             PPC_BITMASK(4, 11)
+#define   PHB_PAPR_ERR_INJ_MASK_MMIO            PPC_BITMASK(16, 63)
+#define PHB_ETU_ERR_SUMMARY             0x2c8
+
+/*  UTL registers */
+#define UTL_SYS_BUS_CONTROL             0x400
+#define UTL_STATUS                      0x408
+#define UTL_SYS_BUS_AGENT_STATUS        0x410
+#define UTL_SYS_BUS_AGENT_ERR_SEVERITY  0x418
+#define UTL_SYS_BUS_AGENT_IRQ_EN        0x420
+#define UTL_SYS_BUS_BURST_SZ_CONF       0x440
+#define UTL_REVISION_ID                 0x448
+#define UTL_BCLK_DOMAIN_DBG1            0x460
+#define UTL_BCLK_DOMAIN_DBG2            0x468
+#define UTL_BCLK_DOMAIN_DBG3            0x470
+#define UTL_BCLK_DOMAIN_DBG4            0x478
+#define UTL_BCLK_DOMAIN_DBG5            0x480
+#define UTL_BCLK_DOMAIN_DBG6            0x488
+#define UTL_OUT_POST_HDR_BUF_ALLOC      0x4c0
+#define UTL_OUT_POST_DAT_BUF_ALLOC      0x4d0
+#define UTL_IN_POST_HDR_BUF_ALLOC       0x4e0
+#define UTL_IN_POST_DAT_BUF_ALLOC       0x4f0
+#define UTL_OUT_NP_BUF_ALLOC            0x500
+#define UTL_IN_NP_BUF_ALLOC             0x510
+#define UTL_PCIE_TAGS_ALLOC             0x520
+#define UTL_GBIF_READ_TAGS_ALLOC        0x530
+#define UTL_PCIE_PORT_CONTROL           0x540
+#define UTL_PCIE_PORT_STATUS            0x548
+#define UTL_PCIE_PORT_ERROR_SEV         0x550
+#define UTL_PCIE_PORT_IRQ_EN            0x558
+#define UTL_RC_STATUS                   0x560
+#define UTL_RC_ERR_SEVERITY             0x568
+#define UTL_RC_IRQ_EN                   0x570
+#define UTL_EP_STATUS                   0x578
+#define UTL_EP_ERR_SEVERITY             0x580
+#define UTL_EP_ERR_IRQ_EN               0x588
+#define UTL_PCI_PM_CTRL1                0x590
+#define UTL_PCI_PM_CTRL2                0x598
+#define UTL_GP_CTL1                     0x5a0
+#define UTL_GP_CTL2                     0x5a8
+#define UTL_PCLK_DOMAIN_DBG1            0x5b0
+#define UTL_PCLK_DOMAIN_DBG2            0x5b8
+#define UTL_PCLK_DOMAIN_DBG3            0x5c0
+#define UTL_PCLK_DOMAIN_DBG4            0x5c8
+
+/* PCI-E Stack registers */
+#define PHB_PCIE_SYSTEM_CONFIG          0x600
+#define PHB_PCIE_BUS_NUMBER             0x608
+#define PHB_PCIE_SYSTEM_TEST            0x618
+#define PHB_PCIE_LINK_MANAGEMENT        0x630
+#define   PHB_PCIE_LM_LINK_ACTIVE       PPC_BIT(8)
+#define PHB_PCIE_DLP_TRAIN_CTL          0x640
+#define   PHB_PCIE_DLP_TCTX_DISABLE     PPC_BIT(1)
+#define   PHB_PCIE_DLP_TCRX_DISABLED    PPC_BIT(16)
+#define   PHB_PCIE_DLP_INBAND_PRESENCE  PPC_BIT(19)
+#define   PHB_PCIE_DLP_TC_DL_LINKUP     PPC_BIT(21)
+#define   PHB_PCIE_DLP_TC_DL_PGRESET    PPC_BIT(22)
+#define   PHB_PCIE_DLP_TC_DL_LINKACT    PPC_BIT(23)
+#define PHB_PCIE_SLOP_LOOPBACK_STATUS   0x648
+#define PHB_PCIE_SYS_LINK_INIT          0x668
+#define PHB_PCIE_UTL_CONFIG             0x670
+#define PHB_PCIE_DLP_CONTROL            0x678
+#define PHB_PCIE_UTL_ERRLOG1            0x680
+#define PHB_PCIE_UTL_ERRLOG2            0x688
+#define PHB_PCIE_UTL_ERRLOG3            0x690
+#define PHB_PCIE_UTL_ERRLOG4            0x698
+#define PHB_PCIE_DLP_ERRLOG1            0x6a0
+#define PHB_PCIE_DLP_ERRLOG2            0x6a8
+#define PHB_PCIE_DLP_ERR_STATUS         0x6b0
+#define PHB_PCIE_DLP_ERR_COUNTERS       0x6b8
+#define PHB_PCIE_UTL_ERR_INJECT         0x6c0
+#define PHB_PCIE_TLDLP_ERR_INJECT       0x6c8
+#define PHB_PCIE_LANE_EQ_CNTL0          0x6d0
+#define PHB_PCIE_LANE_EQ_CNTL1          0x6d8
+#define PHB_PCIE_LANE_EQ_CNTL2          0x6e0
+#define PHB_PCIE_LANE_EQ_CNTL3          0x6e8
+#define PHB_PCIE_STRAPPING              0x700
+
+/* Fundamental register set B */
+#define PHB_VERSION                     0x800
+#define PHB_RESET                       0x808
+#define PHB_CONTROL                     0x810
+#define   PHB_CTRL_IVE_128_BYTES        PPC_BIT(24)
+#define PHB_AIB_RX_CRED_INIT_TIMER      0x818
+#define PHB_AIB_RX_CMD_CRED             0x820
+#define PHB_AIB_RX_DATA_CRED            0x828
+#define PHB_AIB_TX_CMD_CRED             0x830
+#define PHB_AIB_TX_DATA_CRED            0x838
+#define PHB_AIB_TX_CHAN_MAPPING         0x840
+#define PHB_AIB_TAG_ENABLE              0x858
+#define PHB_AIB_FENCE_CTRL              0x860
+#define PHB_TCE_TAG_ENABLE              0x868
+#define PHB_TCE_WATERMARK               0x870
+#define PHB_TIMEOUT_CTRL1               0x878
+#define PHB_TIMEOUT_CTRL2               0x880
+#define PHB_Q_DMA_R                     0x888
+#define   PHB_Q_DMA_R_QUIESCE_DMA       PPC_BIT(0)
+#define   PHB_Q_DMA_R_AUTORESET         PPC_BIT(1)
+#define   PHB_Q_DMA_R_DMA_RESP_STATUS   PPC_BIT(4)
+#define   PHB_Q_DMA_R_MMIO_RESP_STATUS  PPC_BIT(5)
+#define   PHB_Q_DMA_R_TCE_RESP_STATUS   PPC_BIT(6)
+#define PHB_AIB_TAG_STATUS              0x900
+#define PHB_TCE_TAG_STATUS              0x908
+
+/* FIR & Error registers */
+#define PHB_LEM_FIR_ACCUM               0xc00
+#define PHB_LEM_FIR_AND_MASK            0xc08
+#define PHB_LEM_FIR_OR_MASK             0xc10
+#define PHB_LEM_ERROR_MASK              0xc18
+#define PHB_LEM_ERROR_AND_MASK          0xc20
+#define PHB_LEM_ERROR_OR_MASK           0xc28
+#define PHB_LEM_ACTION0                 0xc30
+#define PHB_LEM_ACTION1                 0xc38
+#define PHB_LEM_WOF                     0xc40
+#define PHB_ERR_STATUS                  0xc80
+#define PHB_ERR1_STATUS                 0xc88
+#define PHB_ERR_INJECT                  0xc90
+#define PHB_ERR_LEM_ENABLE              0xc98
+#define PHB_ERR_IRQ_ENABLE              0xca0
+#define PHB_ERR_FREEZE_ENABLE           0xca8
+#define PHB_ERR_AIB_FENCE_ENABLE        0xcb0
+#define PHB_ERR_LOG_0                   0xcc0
+#define PHB_ERR_LOG_1                   0xcc8
+#define PHB_ERR_STATUS_MASK             0xcd0
+#define PHB_ERR1_STATUS_MASK            0xcd8
+
+#define PHB_OUT_ERR_STATUS              0xd00
+#define PHB_OUT_ERR1_STATUS             0xd08
+#define PHB_OUT_ERR_INJECT              0xd10
+#define PHB_OUT_ERR_LEM_ENABLE          0xd18
+#define PHB_OUT_ERR_IRQ_ENABLE          0xd20
+#define PHB_OUT_ERR_FREEZE_ENABLE       0xd28
+#define PHB_OUT_ERR_AIB_FENCE_ENABLE    0xd30
+#define PHB_OUT_ERR_LOG_0               0xd40
+#define PHB_OUT_ERR_LOG_1               0xd48
+#define PHB_OUT_ERR_STATUS_MASK         0xd50
+#define PHB_OUT_ERR1_STATUS_MASK        0xd58
+
+#define PHB_INA_ERR_STATUS              0xd80
+#define PHB_INA_ERR1_STATUS             0xd88
+#define PHB_INA_ERR_INJECT              0xd90
+#define PHB_INA_ERR_LEM_ENABLE          0xd98
+#define PHB_INA_ERR_IRQ_ENABLE          0xda0
+#define PHB_INA_ERR_FREEZE_ENABLE       0xda8
+#define PHB_INA_ERR_AIB_FENCE_ENABLE    0xdb0
+#define PHB_INA_ERR_LOG_0               0xdc0
+#define PHB_INA_ERR_LOG_1               0xdc8
+#define PHB_INA_ERR_STATUS_MASK         0xdd0
+#define PHB_INA_ERR1_STATUS_MASK        0xdd8
+
+#define PHB_INB_ERR_STATUS              0xe00
+#define PHB_INB_ERR1_STATUS             0xe08
+#define PHB_INB_ERR_INJECT              0xe10
+#define PHB_INB_ERR_LEM_ENABLE          0xe18
+#define PHB_INB_ERR_IRQ_ENABLE          0xe20
+#define PHB_INB_ERR_FREEZE_ENABLE       0xe28
+#define PHB_INB_ERR_AIB_FENCE_ENABLE    0xe30
+#define PHB_INB_ERR_LOG_0               0xe40
+#define PHB_INB_ERR_LOG_1               0xe48
+#define PHB_INB_ERR_STATUS_MASK         0xe50
+#define PHB_INB_ERR1_STATUS_MASK        0xe58
+
+/* Performance monitor & Debug registers */
+#define PHB_TRACE_CONTROL               0xf80
+#define PHB_PERFMON_CONFIG              0xf88
+#define PHB_PERFMON_CTR0                0xf90
+#define PHB_PERFMON_CTR1                0xf98
+#define PHB_PERFMON_CTR2                0xfa0
+#define PHB_PERFMON_CTR3                0xfa8
+#define PHB_HOTPLUG_OVERRIDE            0xfb0
+#define   PHB_HPOVR_FORCE_RESAMPLE      PPC_BIT(9)
+#define   PHB_HPOVR_PRESENCE_A          PPC_BIT(10)
+#define   PHB_HPOVR_PRESENCE_B          PPC_BIT(11)
+#define   PHB_HPOVR_LINK_ACTIVE         PPC_BIT(12)
+#define   PHB_HPOVR_LINK_BIFURCATED     PPC_BIT(13)
+#define   PHB_HPOVR_LINK_LANE_SWAPPED   PPC_BIT(14)
+
+/*
+ * IODA2 on-chip tables
+ */
+
+#define IODA2_TBL_LIST          1
+#define IODA2_TBL_LXIVT         2
+#define IODA2_TBL_IVC_CAM       3
+#define IODA2_TBL_RBA           4
+#define IODA2_TBL_RCAM          5
+#define IODA2_TBL_MRT           6
+#define IODA2_TBL_PESTA         7
+#define IODA2_TBL_PESTB         8
+#define IODA2_TBL_TVT           9
+#define IODA2_TBL_TCAM          10
+#define IODA2_TBL_TDR           11
+#define IODA2_TBL_M64BT         16
+#define IODA2_TBL_M32DT         17
+#define IODA2_TBL_PEEV          20
+
+/* LXIVT */
+#define IODA2_LXIVT_SERVER              PPC_BITMASK(8, 23)
+#define IODA2_LXIVT_PRIORITY            PPC_BITMASK(24, 31)
+#define IODA2_LXIVT_NODE_ID             PPC_BITMASK(56, 63)
+
+/* IVT */
+#define IODA2_IVT_SERVER                PPC_BITMASK(0, 23)
+#define IODA2_IVT_PRIORITY              PPC_BITMASK(24, 31)
+#define IODA2_IVT_GEN                   PPC_BITMASK(37, 38)
+#define IODA2_IVT_P                     PPC_BITMASK(39, 39)
+#define IODA2_IVT_Q                     PPC_BITMASK(47, 47)
+#define IODA2_IVT_PE                    PPC_BITMASK(48, 63)
+
+/* TVT */
+#define IODA2_TVT_TABLE_ADDR            PPC_BITMASK(0, 47)
+#define IODA2_TVT_NUM_LEVELS            PPC_BITMASK(48, 50)
+#define   IODA2_TVE_1_LEVEL     0
+#define   IODA2_TVE_2_LEVELS    1
+#define   IODA2_TVE_3_LEVELS    2
+#define   IODA2_TVE_4_LEVELS    3
+#define   IODA2_TVE_5_LEVELS    4
+#define IODA2_TVT_TCE_TABLE_SIZE        PPC_BITMASK(51, 55)
+#define IODA2_TVT_IO_PSIZE              PPC_BITMASK(59, 63)
+
+/* PESTA */
+#define IODA2_PESTA_MMIO_FROZEN         PPC_BIT(0)
+
+/* PESTB */
+#define IODA2_PESTB_DMA_STOPPED         PPC_BIT(0)
+
+/* M32DT */
+#define IODA2_M32DT_PE                  PPC_BITMASK(8, 15)
+
+/* M64BT */
+#define IODA2_M64BT_ENABLE              PPC_BIT(0)
+#define IODA2_M64BT_SINGLE_PE           PPC_BIT(1)
+#define IODA2_M64BT_BASE                PPC_BITMASK(2, 31)
+#define IODA2_M64BT_MASK                PPC_BITMASK(34, 63)
+#define IODA2_M64BT_SINGLE_BASE         PPC_BITMASK(2, 26)
+#define IODA2_M64BT_PE_HI               PPC_BITMASK(27, 31)
+#define IODA2_M64BT_SINGLE_MASK         PPC_BITMASK(34, 58)
+#define IODA2_M64BT_PE_LOW              PPC_BITMASK(59, 63)
+
+/*
+ * IODA2 in-memory tables
+ */
+
+/*
+ * PEST
+ *
+ * 2x8 bytes entries, PEST0 and PEST1
+ */
+
+#define IODA2_PEST0_MMIO_CAUSE          PPC_BIT(2)
+#define IODA2_PEST0_CFG_READ            PPC_BIT(3)
+#define IODA2_PEST0_CFG_WRITE           PPC_BIT(4)
+#define IODA2_PEST0_TTYPE               PPC_BITMASK(5, 7)
+#define   PEST_TTYPE_DMA_WRITE          0
+#define   PEST_TTYPE_MSI                1
+#define   PEST_TTYPE_DMA_READ           2
+#define   PEST_TTYPE_DMA_READ_RESP      3
+#define   PEST_TTYPE_MMIO_LOAD          4
+#define   PEST_TTYPE_MMIO_STORE         5
+#define   PEST_TTYPE_OTHER              7
+#define IODA2_PEST0_CA_RETURN           PPC_BIT(8)
+#define IODA2_PEST0_UTL_RTOS_TIMEOUT    PPC_BIT(8) /* Same bit as CA return */
+#define IODA2_PEST0_UR_RETURN           PPC_BIT(9)
+#define IODA2_PEST0_UTL_NONFATAL        PPC_BIT(10)
+#define IODA2_PEST0_UTL_FATAL           PPC_BIT(11)
+#define IODA2_PEST0_PARITY_UE           PPC_BIT(13)
+#define IODA2_PEST0_UTL_CORRECTABLE     PPC_BIT(14)
+#define IODA2_PEST0_UTL_INTERRUPT       PPC_BIT(15)
+#define IODA2_PEST0_MMIO_XLATE          PPC_BIT(16)
+#define IODA2_PEST0_IODA2_ERROR         PPC_BIT(16) /* Same bit as MMIO xlate */
+#define IODA2_PEST0_TCE_PAGE_FAULT      PPC_BIT(18)
+#define IODA2_PEST0_TCE_ACCESS_FAULT    PPC_BIT(19)
+#define IODA2_PEST0_DMA_RESP_TIMEOUT    PPC_BIT(20)
+#define IODA2_PEST0_AIB_SIZE_INVALID    PPC_BIT(21)
+#define IODA2_PEST0_LEM_BIT             PPC_BITMASK(26, 31)
+#define IODA2_PEST0_RID                 PPC_BITMASK(32, 47)
+#define IODA2_PEST0_MSI_DATA            PPC_BITMASK(48, 63)
+
+#define IODA2_PEST1_FAIL_ADDR           PPC_BITMASK(3, 63)
+
+
+#endif /* PCI_HOST_PNV_PHB3_REGS_H */
diff --git a/include/hw/pci-host/pnv_phb4.h b/include/hw/pci-host/pnv_phb4.h
new file mode 100644
index 0000000..c882bfd
--- /dev/null
+++ b/include/hw/pci-host/pnv_phb4.h
@@ -0,0 +1,230 @@
+/*
+ * QEMU PowerPC PowerNV (POWER9) PHB4 model
+ *
+ * Copyright (c) 2018-2020, IBM Corporation.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ */
+
+#ifndef PCI_HOST_PNV_PHB4_H
+#define PCI_HOST_PNV_PHB4_H
+
+#include "hw/pci/pcie_host.h"
+#include "hw/pci/pcie_port.h"
+#include "hw/ppc/xive.h"
+
+typedef struct PnvPhb4PecState PnvPhb4PecState;
+typedef struct PnvPhb4PecStack PnvPhb4PecStack;
+typedef struct PnvPHB4 PnvPHB4;
+typedef struct PnvChip PnvChip;
+
+/*
+ * We have one such address space wrapper per possible device under
+ * the PHB since they need to be assigned statically at qemu device
+ * creation time. The relationship to a PE is done later
+ * dynamically. This means we can potentially create a lot of these
+ * guys. Q35 stores them as some kind of radix tree but we never
+ * really need to do fast lookups, so instead we simply keep a QLIST of
+ * them for now; we can add the radix tree if needed later on.
+ *
+ * We do cache the PE number to speed things up a bit though.
+ */
+typedef struct PnvPhb4DMASpace {
+    PCIBus *bus;
+    uint8_t devfn;
+    int pe_num;         /* Cached PE number */
+#define PHB_INVALID_PE (-1)
+    PnvPHB4 *phb;
+    AddressSpace dma_as;
+    IOMMUMemoryRegion dma_mr;
+    MemoryRegion msi32_mr;
+    MemoryRegion msi64_mr;
+    QLIST_ENTRY(PnvPhb4DMASpace) list;
+} PnvPhb4DMASpace;
+
+/*
+ * PHB4 PCIe Root port
+ */
+#define TYPE_PNV_PHB4_ROOT_BUS "pnv-phb4-root-bus"
+#define TYPE_PNV_PHB4_ROOT_PORT "pnv-phb4-root-port"
+
+typedef struct PnvPHB4RootPort {
+    PCIESlot parent_obj;
+} PnvPHB4RootPort;
+
+/*
+ * PHB4 PCIe Host Bridge for PowerNV machines (POWER9)
+ */
+#define TYPE_PNV_PHB4 "pnv-phb4"
+#define PNV_PHB4(obj) OBJECT_CHECK(PnvPHB4, (obj), TYPE_PNV_PHB4)
+
+#define PNV_PHB4_MAX_LSIs          8
+#define PNV_PHB4_MAX_INTs          4096
+#define PNV_PHB4_MAX_MIST          (PNV_PHB4_MAX_INTs >> 2)
+#define PNV_PHB4_MAX_MMIO_WINDOWS  32
+#define PNV_PHB4_MIN_MMIO_WINDOWS  16
+#define PNV_PHB4_NUM_REGS          (0x3000 >> 3)
+#define PNV_PHB4_MAX_PEs           512
+#define PNV_PHB4_MAX_TVEs          (PNV_PHB4_MAX_PEs * 2)
+#define PNV_PHB4_MAX_PEEVs         (PNV_PHB4_MAX_PEs / 64)
+#define PNV_PHB4_MAX_MBEs          (PNV_PHB4_MAX_MMIO_WINDOWS * 2)
+
+#define PNV_PHB4_VERSION           0x000000a400000002ull
+#define PNV_PHB4_DEVICE_ID         0x04c1
+
+#define PCI_MMIO_TOTAL_SIZE        (0x1ull << 60)
+
+struct PnvPHB4 {
+    PCIExpressHost parent_obj;
+
+    PnvPHB4RootPort root;
+
+    uint32_t chip_id;
+    uint32_t phb_id;
+
+    uint64_t version;
+    uint16_t device_id;
+
+    char bus_path[8];
+
+    /* Main register images */
+    uint64_t regs[PNV_PHB4_NUM_REGS];
+    MemoryRegion mr_regs;
+
+    /* Extra SCOM-only register */
+    uint64_t scom_hv_ind_addr_reg;
+
+    /*
+     * Geometry of the PHB. There are two types, small and big PHBs; a
+     * number of resources (number of PEs, windows, etc.) are doubled
+     * for a big PHB.
+     */
+    bool big_phb;
+
+    /* Memory regions for MMIO space */
+    MemoryRegion mr_mmio[PNV_PHB4_MAX_MMIO_WINDOWS];
+
+    /* PCI side space */
+    MemoryRegion pci_mmio;
+    MemoryRegion pci_io;
+
+    /* On-chip IODA tables */
+    uint64_t ioda_LIST[PNV_PHB4_MAX_LSIs];
+    uint64_t ioda_MIST[PNV_PHB4_MAX_MIST];
+    uint64_t ioda_TVT[PNV_PHB4_MAX_TVEs];
+    uint64_t ioda_MBT[PNV_PHB4_MAX_MBEs];
+    uint64_t ioda_MDT[PNV_PHB4_MAX_PEs];
+    uint64_t ioda_PEEV[PNV_PHB4_MAX_PEEVs];
+
+    /*
+     * The internal PESTA/B is 2 bits per PE split into two tables; we
+     * store them in a single array here to avoid wasting space.
+     */
+    uint8_t  ioda_PEST_AB[PNV_PHB4_MAX_PEs];
+
+    /* P9 Interrupt generation */
+    XiveSource xsrc;
+    qemu_irq *qirqs;
+
+    PnvPhb4PecStack *stack;
+
+    QLIST_HEAD(, PnvPhb4DMASpace) dma_spaces;
+};
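
As an illustration of the packed PESTA/B comment inside the struct above,
here is a rough sketch of helpers over the single ioda_PEST_AB array; the bit
assignments and helper names below are assumptions made for the example, not
the model's actual encoding.

    #define EXAMPLE_PEST_A_FROZEN    0x1   /* assumed: PESTA "MMIO frozen" bit */
    #define EXAMPLE_PEST_B_STOPPED   0x2   /* assumed: PESTB "DMA stopped" bit */

    static inline void example_pest_freeze_pe(PnvPHB4 *phb, unsigned pe_num)
    {
        /* Both state bits live in the same byte, one byte per PE */
        phb->ioda_PEST_AB[pe_num] |= EXAMPLE_PEST_A_FROZEN | EXAMPLE_PEST_B_STOPPED;
    }

    static inline bool example_pest_pe_is_frozen(PnvPHB4 *phb, unsigned pe_num)
    {
        return phb->ioda_PEST_AB[pe_num] & EXAMPLE_PEST_A_FROZEN;
    }
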
+
+void pnv_phb4_pic_print_info(PnvPHB4 *phb, Monitor *mon);
+void pnv_phb4_update_regions(PnvPhb4PecStack *stack);
+extern const MemoryRegionOps pnv_phb4_xscom_ops;
+
+/*
+ * PHB4 PEC (PCI Express Controller)
+ */
+#define TYPE_PNV_PHB4_PEC "pnv-phb4-pec"
+#define PNV_PHB4_PEC(obj) \
+    OBJECT_CHECK(PnvPhb4PecState, (obj), TYPE_PNV_PHB4_PEC)
+
+#define TYPE_PNV_PHB4_PEC_STACK "pnv-phb4-pec-stack"
+#define PNV_PHB4_PEC_STACK(obj) \
+    OBJECT_CHECK(PnvPhb4PecStack, (obj), TYPE_PNV_PHB4_PEC_STACK)
+
+/* Per-stack data */
+struct PnvPhb4PecStack {
+    DeviceState parent;
+
+    /* My own stack number */
+    uint32_t stack_no;
+
+    /* Nest registers */
+#define PHB4_PEC_NEST_STK_REGS_COUNT  0x17
+    uint64_t nest_regs[PHB4_PEC_NEST_STK_REGS_COUNT];
+    MemoryRegion nest_regs_mr;
+
+    /* PCI registers (excluding pass-through) */
+#define PHB4_PEC_PCI_STK_REGS_COUNT  0xf
+    uint64_t pci_regs[PHB4_PEC_PCI_STK_REGS_COUNT];
+    MemoryRegion pci_regs_mr;
+
+    /* PHB pass-through XSCOM */
+    MemoryRegion phb_regs_mr;
+
+    /* Memory windows from PowerBus to PHB */
+    MemoryRegion mmbar0;
+    MemoryRegion mmbar1;
+    MemoryRegion phbbar;
+    MemoryRegion intbar;
+    uint64_t mmio0_base;
+    uint64_t mmio0_size;
+    uint64_t mmio1_base;
+    uint64_t mmio1_size;
+
+    /* The owner PEC */
+    PnvPhb4PecState *pec;
+
+    /* The actual PHB */
+    PnvPHB4 phb;
+};
+
+struct PnvPhb4PecState {
+    DeviceState parent;
+
+    /* PEC number in chip */
+    uint32_t index;
+    uint32_t chip_id;
+
+    MemoryRegion *system_memory;
+
+    /* Nest registers, excluding per-stack */
+#define PHB4_PEC_NEST_REGS_COUNT    0xf
+    uint64_t nest_regs[PHB4_PEC_NEST_REGS_COUNT];
+    MemoryRegion nest_regs_mr;
+
+    /* PCI registers, excluding per-stack */
+#define PHB4_PEC_PCI_REGS_COUNT     0x2
+    uint64_t pci_regs[PHB4_PEC_PCI_REGS_COUNT];
+    MemoryRegion pci_regs_mr;
+
+    /* Stacks */
+    #define PHB4_PEC_MAX_STACKS     3
+    uint32_t num_stacks;
+    PnvPhb4PecStack stacks[PHB4_PEC_MAX_STACKS];
+};
+
+#define PNV_PHB4_PEC_CLASS(klass) \
+     OBJECT_CLASS_CHECK(PnvPhb4PecClass, (klass), TYPE_PNV_PHB4_PEC)
+#define PNV_PHB4_PEC_GET_CLASS(obj) \
+     OBJECT_GET_CLASS(PnvPhb4PecClass, (obj), TYPE_PNV_PHB4_PEC)
+
+typedef struct PnvPhb4PecClass {
+    DeviceClass parent_class;
+
+    uint32_t (*xscom_nest_base)(PnvPhb4PecState *pec);
+    uint32_t xscom_nest_size;
+    uint32_t (*xscom_pci_base)(PnvPhb4PecState *pec);
+    uint32_t xscom_pci_size;
+    const char *compat;
+    int compat_size;
+    const char *stk_compat;
+    int stk_compat_size;
+} PnvPhb4PecClass;
+
+#endif /* PCI_HOST_PNV_PHB4_H */
diff --git a/include/hw/pci-host/pnv_phb4_regs.h b/include/hw/pci-host/pnv_phb4_regs.h
new file mode 100644
index 0000000..55df2c3
--- /dev/null
+++ b/include/hw/pci-host/pnv_phb4_regs.h
@@ -0,0 +1,553 @@
+/*
+ * QEMU PowerPC PowerNV (POWER9) PHB4 model
+ *
+ * Copyright (c) 2013-2020, IBM Corporation.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ */
+
+#ifndef PCI_HOST_PNV_PHB4_REGS_H
+#define PCI_HOST_PNV_PHB4_REGS_H
+
+/*
+ * PEC XSCOM registers
+ *
+ * There are 3 PECs in P9. Each PEC can have several PHBs. Each PEC has some
+ * "global" registers and some "per-stack" (per-PHB) registers. Those are
+ * organized in two XSCOM ranges, the "Nest" range and the "PCI" range; each
+ * range contains both some "PEC" registers and some "per-stack" registers.
+ *
+ * Finally, the PCI range also contains an additional range per stack that
+ * passes through to some of the PHB's own registers.
+ *
+ * PEC0 can contain 1 PHB  (PHB0)
+ * PEC1 can contain 2 PHBs (PHB1 and PHB2)
+ * PEC2 can contain 3 PHBs (PHB3, PHB4 and PHB5)
+ */
+
+/*
+ * This is the "stack" offset, it's the offset from a given range base
+ * to the first "per-stack" registers and also the stride between
+ * stacks, thus for PEC2, the global registers are at offset 0, the
+ * PHB3 registers at offset 0x40, the PHB4 at offset 0x80 etc....
+ *
+ * It is *also* the offset to the pass-through SCOM region but in this case
+ * it is 0 based, ie PHB3 is at 0x100 PHB4 is a 0x140 etc..
+ */
+#define PEC_STACK_OFFSET        0x40
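
To make the arithmetic in the comment above concrete, here is an editorial
sketch of the two offset computations it describes; the helper names are
hypothetical and the 0x100 pass-through base is inferred from the PHB3/PHB4
examples given in the comment.

    /* Stack N's registers within a PEC's Nest or PCI XSCOM range */
    static inline uint32_t example_pec_stack_offset(uint32_t stack_no)
    {
        return (stack_no + 1) * PEC_STACK_OFFSET;   /* PEC2: PHB3 -> 0x40, PHB4 -> 0x80 */
    }

    /* Stack N's pass-through SCOM region (0 based) */
    static inline uint32_t example_pec_passthru_offset(uint32_t stack_no)
    {
        return 0x100 + stack_no * PEC_STACK_OFFSET; /* PEC2: PHB3 -> 0x100, PHB4 -> 0x140 */
    }
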
+
+/* XSCOM Nest global registers */
+#define PEC_NEST_PBCQ_HW_CONFIG         0x00
+#define PEC_NEST_DROP_PRIO_CTRL         0x01
+#define PEC_NEST_PBCQ_ERR_INJECT        0x02
+#define PEC_NEST_PCI_NEST_CLK_TRACE_CTL 0x03
+#define PEC_NEST_PBCQ_PMON_CTRL         0x04
+#define PEC_NEST_PBCQ_PBUS_ADDR_EXT     0x05
+#define PEC_NEST_PBCQ_PRED_VEC_TIMEOUT  0x06
+#define PEC_NEST_CAPP_CTRL              0x07
+#define PEC_NEST_PBCQ_READ_STK_OVR      0x08
+#define PEC_NEST_PBCQ_WRITE_STK_OVR     0x09
+#define PEC_NEST_PBCQ_STORE_STK_OVR     0x0a
+#define PEC_NEST_PBCQ_RETRY_BKOFF_CTRL  0x0b
+
+/* XSCOM Nest per-stack registers */
+#define PEC_NEST_STK_PCI_NEST_FIR       0x00
+#define PEC_NEST_STK_PCI_NEST_FIR_CLR   0x01
+#define PEC_NEST_STK_PCI_NEST_FIR_SET   0x02
+#define PEC_NEST_STK_PCI_NEST_FIR_MSK   0x03
+#define PEC_NEST_STK_PCI_NEST_FIR_MSKC  0x04
+#define PEC_NEST_STK_PCI_NEST_FIR_MSKS  0x05
+#define PEC_NEST_STK_PCI_NEST_FIR_ACT0  0x06
+#define PEC_NEST_STK_PCI_NEST_FIR_ACT1  0x07
+#define PEC_NEST_STK_PCI_NEST_FIR_WOF   0x08
+#define PEC_NEST_STK_ERR_REPORT_0       0x0a
+#define PEC_NEST_STK_ERR_REPORT_1       0x0b
+#define PEC_NEST_STK_PBCQ_GNRL_STATUS   0x0c
+#define PEC_NEST_STK_PBCQ_MODE          0x0d
+#define PEC_NEST_STK_MMIO_BAR0          0x0e
+#define PEC_NEST_STK_MMIO_BAR0_MASK     0x0f
+#define PEC_NEST_STK_MMIO_BAR1          0x10
+#define PEC_NEST_STK_MMIO_BAR1_MASK     0x11
+#define PEC_NEST_STK_PHB_REGS_BAR       0x12
+#define PEC_NEST_STK_INT_BAR            0x13
+#define PEC_NEST_STK_BAR_EN             0x14
+#define   PEC_NEST_STK_BAR_EN_MMIO0             PPC_BIT(0)
+#define   PEC_NEST_STK_BAR_EN_MMIO1             PPC_BIT(1)
+#define   PEC_NEST_STK_BAR_EN_PHB               PPC_BIT(2)
+#define   PEC_NEST_STK_BAR_EN_INT               PPC_BIT(3)
+#define PEC_NEST_STK_DATA_FRZ_TYPE      0x15
+#define PEC_NEST_STK_PBCQ_TUN_BAR       0x16
+
+/* XSCOM PCI global registers */
+#define PEC_PCI_PBAIB_HW_CONFIG         0x00
+#define PEC_PCI_PBAIB_READ_STK_OVR      0x02
+
+/* XSCOM PCI per-stack registers */
+#define PEC_PCI_STK_PCI_FIR             0x00
+#define PEC_PCI_STK_PCI_FIR_CLR         0x01
+#define PEC_PCI_STK_PCI_FIR_SET         0x02
+#define PEC_PCI_STK_PCI_FIR_MSK         0x03
+#define PEC_PCI_STK_PCI_FIR_MSKC        0x04
+#define PEC_PCI_STK_PCI_FIR_MSKS        0x05
+#define PEC_PCI_STK_PCI_FIR_ACT0        0x06
+#define PEC_PCI_STK_PCI_FIR_ACT1        0x07
+#define PEC_PCI_STK_PCI_FIR_WOF         0x08
+#define PEC_PCI_STK_ETU_RESET           0x0a
+#define PEC_PCI_STK_PBAIB_ERR_REPORT    0x0b
+#define PEC_PCI_STK_PBAIB_TX_CMD_CRED   0x0d
+#define PEC_PCI_STK_PBAIB_TX_DAT_CRED   0x0e
+
+/*
+ * PHB "SCOM" registers. This is accessed via the above window
+ * and provides a backdoor to the PHB when the AIB bus is not
+ * functional. Some of these directly map some of the PHB MMIO
+ * registers, some are specific and allow indirect access to a
+ * wider range of PHB registers
+ */
+#define PHB_SCOM_HV_IND_ADDR            0x00
+#define   PHB_SCOM_HV_IND_ADDR_VALID            PPC_BIT(0)
+#define   PHB_SCOM_HV_IND_ADDR_4B               PPC_BIT(1)
+#define   PHB_SCOM_HV_IND_ADDR_AUTOINC          PPC_BIT(2)
+#define   PHB_SCOM_HV_IND_ADDR_ADDR             PPC_BITMASK(51, 63)
+#define PHB_SCOM_HV_IND_DATA            0x01
+#define PHB_SCOM_ETU_LEM_FIR            0x08
+#define PHB_SCOM_ETU_LEM_FIR_AND        0x09
+#define PHB_SCOM_ETU_LEM_FIR_OR         0x0a
+#define PHB_SCOM_ETU_LEM_FIR_MSK        0x0b
+#define PHB_SCOM_ETU_LEM_ERR_MSK_AND    0x0c
+#define PHB_SCOM_ETU_LEM_ERR_MSK_OR     0x0d
+#define PHB_SCOM_ETU_LEM_ACT0           0x0e
+#define PHB_SCOM_ETU_LEM_ACT1           0x0f
+#define PHB_SCOM_ETU_LEM_WOF            0x10
+#define PHB_SCOM_ETU_PMON_CONFIG        0x17
+#define PHB_SCOM_ETU_PMON_CTR0          0x18
+#define PHB_SCOM_ETU_PMON_CTR1          0x19
+#define PHB_SCOM_ETU_PMON_CTR2          0x1a
+#define PHB_SCOM_ETU_PMON_CTR3          0x1b
+
+
+/*
+ * PHB MMIO registers
+ */
+
+/* PHB Fundamental register set A */
+#define PHB_LSI_SOURCE_ID               0x100
+#define   PHB_LSI_SRC_ID                PPC_BITMASK(4, 12)
+#define PHB_DMA_CHAN_STATUS             0x110
+#define   PHB_DMA_CHAN_ANY_ERR          PPC_BIT(27)
+#define   PHB_DMA_CHAN_ANY_ERR1         PPC_BIT(28)
+#define   PHB_DMA_CHAN_ANY_FREEZE       PPC_BIT(29)
+#define PHB_CPU_LOADSTORE_STATUS        0x120
+#define   PHB_CPU_LS_ANY_ERR            PPC_BIT(27)
+#define   PHB_CPU_LS_ANY_ERR1           PPC_BIT(28)
+#define   PHB_CPU_LS_ANY_FREEZE         PPC_BIT(29)
+#define PHB_CONFIG_DATA                 0x130
+#define PHB_LOCK0                       0x138
+#define PHB_CONFIG_ADDRESS              0x140
+#define   PHB_CA_ENABLE                 PPC_BIT(0)
+#define   PHB_CA_STATUS                 PPC_BITMASK(1, 3)
+#define     PHB_CA_STATUS_GOOD          0
+#define     PHB_CA_STATUS_UR            1
+#define     PHB_CA_STATUS_CRS           2
+#define     PHB_CA_STATUS_CA            4
+#define   PHB_CA_BUS                    PPC_BITMASK(4, 11)
+#define   PHB_CA_DEV                    PPC_BITMASK(12, 16)
+#define   PHB_CA_FUNC                   PPC_BITMASK(17, 19)
+#define   PHB_CA_BDFN                   PPC_BITMASK(4, 19) /* bus,dev,func */
+#define   PHB_CA_REG                    PPC_BITMASK(20, 31)
+#define   PHB_CA_PE                     PPC_BITMASK(39, 47)
+#define PHB_LOCK1                       0x148
+#define PHB_PHB4_CONFIG                 0x160
+#define   PHB_PHB4C_32BIT_MSI_EN        PPC_BIT(8)
+#define   PHB_PHB4C_64BIT_MSI_EN        PPC_BIT(14)
+#define PHB_RTT_BAR                     0x168
+#define   PHB_RTT_BAR_ENABLE            PPC_BIT(0)
+#define   PHB_RTT_BASE_ADDRESS_MASK     PPC_BITMASK(8, 46)
+#define PHB_PELTV_BAR                   0x188
+#define   PHB_PELTV_BAR_ENABLE          PPC_BIT(0)
+#define   PHB_PELTV_BASE_ADDRESS        PPC_BITMASK(8, 50)
+#define PHB_M32_START_ADDR              0x1a0
+#define PHB_PEST_BAR                    0x1a8
+#define   PHB_PEST_BAR_ENABLE           PPC_BIT(0)
+#define   PHB_PEST_BASE_ADDRESS         PPC_BITMASK(8, 51)
+#define PHB_ASN_CMPM                    0x1C0
+#define   PHB_ASN_CMPM_ENABLE           PPC_BIT(63)
+#define PHB_CAPI_CMPM                   0x1C8
+#define   PHB_CAPI_CMPM_ENABLE          PPC_BIT(63)
+#define PHB_M64_AOMASK                  0x1d0
+#define PHB_M64_UPPER_BITS              0x1f0
+#define PHB_NXLATE_PREFIX               0x1f8
+#define PHB_DMARD_SYNC                  0x200
+#define   PHB_DMARD_SYNC_START          PPC_BIT(0)
+#define   PHB_DMARD_SYNC_COMPLETE       PPC_BIT(1)
+#define PHB_RTC_INVALIDATE              0x208
+#define   PHB_RTC_INVALIDATE_ALL        PPC_BIT(0)
+#define   PHB_RTC_INVALIDATE_RID        PPC_BITMASK(16, 31)
+#define PHB_TCE_KILL                    0x210
+#define   PHB_TCE_KILL_ALL              PPC_BIT(0)
+#define   PHB_TCE_KILL_PE               PPC_BIT(1)
+#define   PHB_TCE_KILL_ONE              PPC_BIT(2)
+#define   PHB_TCE_KILL_PSEL             PPC_BIT(3)
+#define   PHB_TCE_KILL_64K              0x1000 /* Address override */
+#define   PHB_TCE_KILL_2M               0x2000 /* Address override */
+#define   PHB_TCE_KILL_1G               0x3000 /* Address override */
+#define   PHB_TCE_KILL_PENUM            PPC_BITMASK(55, 63)
+#define PHB_TCE_SPEC_CTL                0x218
+#define PHB_IODA_ADDR                   0x220
+#define   PHB_IODA_AD_AUTOINC           PPC_BIT(0)
+#define   PHB_IODA_AD_TSEL              PPC_BITMASK(11, 15)
+#define   PHB_IODA_AD_MIST_PWV          PPC_BITMASK(28, 31)
+#define   PHB_IODA_AD_TADR              PPC_BITMASK(54, 63)
+#define PHB_IODA_DATA0                  0x228
+#define PHB_PHB4_GEN_CAP                0x250
+#define PHB_PHB4_TCE_CAP                0x258
+#define PHB_PHB4_IRQ_CAP                0x260
+#define PHB_PHB4_EEH_CAP                0x268
+#define PHB_PAPR_ERR_INJ_CTL            0x2b0
+#define   PHB_PAPR_ERR_INJ_CTL_INB      PPC_BIT(0)
+#define   PHB_PAPR_ERR_INJ_CTL_OUTB     PPC_BIT(1)
+#define   PHB_PAPR_ERR_INJ_CTL_STICKY   PPC_BIT(2)
+#define   PHB_PAPR_ERR_INJ_CTL_CFG      PPC_BIT(3)
+#define   PHB_PAPR_ERR_INJ_CTL_RD       PPC_BIT(4)
+#define   PHB_PAPR_ERR_INJ_CTL_WR       PPC_BIT(5)
+#define   PHB_PAPR_ERR_INJ_CTL_FREEZE   PPC_BIT(6)
+#define PHB_PAPR_ERR_INJ_ADDR           0x2b8
+#define   PHB_PAPR_ERR_INJ_ADDR_MMIO            PPC_BITMASK(16, 63)
+#define PHB_PAPR_ERR_INJ_MASK           0x2c0
+#define   PHB_PAPR_ERR_INJ_MASK_CFG             PPC_BITMASK(4, 11)
+#define   PHB_PAPR_ERR_INJ_MASK_CFG_ALL         PPC_BITMASK(4, 19)
+#define   PHB_PAPR_ERR_INJ_MASK_MMIO            PPC_BITMASK(16, 63)
+#define PHB_ETU_ERR_SUMMARY             0x2c8
+#define PHB_INT_NOTIFY_ADDR             0x300
+#define PHB_INT_NOTIFY_INDEX            0x308
+
+/* Fundamental register set B */
+#define PHB_VERSION                     0x800
+#define PHB_CTRLR                       0x810
+#define   PHB_CTRLR_IRQ_PGSZ_64K        PPC_BIT(11)
+#define   PHB_CTRLR_IRQ_STORE_EOI       PPC_BIT(12)
+#define   PHB_CTRLR_MMIO_RD_STRICT      PPC_BIT(13)
+#define   PHB_CTRLR_MMIO_EEH_DISABLE    PPC_BIT(14)
+#define   PHB_CTRLR_CFG_EEH_BLOCK       PPC_BIT(15)
+#define   PHB_CTRLR_FENCE_LNKILL_DIS    PPC_BIT(16)
+#define   PHB_CTRLR_TVT_ADDR_SEL        PPC_BITMASK(17, 19)
+#define     TVT_DD1_1_PER_PE            0
+#define     TVT_DD1_2_PER_PE            1
+#define     TVT_DD1_4_PER_PE            2
+#define     TVT_DD1_8_PER_PE            3
+#define     TVT_DD1_16_PER_PE           4
+#define     TVT_2_PER_PE                0
+#define     TVT_4_PER_PE                1
+#define     TVT_8_PER_PE                2
+#define     TVT_16_PER_PE               3
+#define   PHB_CTRLR_DMA_RD_SPACING      PPC_BITMASK(28, 31)
+#define PHB_AIB_FENCE_CTRL              0x860
+#define PHB_TCE_TAG_ENABLE              0x868
+#define PHB_TCE_WATERMARK               0x870
+#define PHB_TIMEOUT_CTRL1               0x878
+#define PHB_TIMEOUT_CTRL2               0x880
+#define PHB_Q_DMA_R                     0x888
+#define   PHB_Q_DMA_R_QUIESCE_DMA       PPC_BIT(0)
+#define   PHB_Q_DMA_R_AUTORESET         PPC_BIT(1)
+#define   PHB_Q_DMA_R_DMA_RESP_STATUS   PPC_BIT(4)
+#define   PHB_Q_DMA_R_MMIO_RESP_STATUS  PPC_BIT(5)
+#define   PHB_Q_DMA_R_TCE_RESP_STATUS   PPC_BIT(6)
+#define   PHB_Q_DMA_R_TCE_KILL_STATUS   PPC_BIT(7)
+#define PHB_TCE_TAG_STATUS              0x908
+
+/* FIR & Error registers */
+#define PHB_LEM_FIR_ACCUM               0xc00
+#define PHB_LEM_FIR_AND_MASK            0xc08
+#define PHB_LEM_FIR_OR_MASK             0xc10
+#define PHB_LEM_ERROR_MASK              0xc18
+#define PHB_LEM_ERROR_AND_MASK          0xc20
+#define PHB_LEM_ERROR_OR_MASK           0xc28
+#define PHB_LEM_ACTION0                 0xc30
+#define PHB_LEM_ACTION1                 0xc38
+#define PHB_LEM_WOF                     0xc40
+#define PHB_ERR_STATUS                  0xc80
+#define PHB_ERR1_STATUS                 0xc88
+#define PHB_ERR_INJECT                  0xc90
+#define PHB_ERR_LEM_ENABLE              0xc98
+#define PHB_ERR_IRQ_ENABLE              0xca0
+#define PHB_ERR_FREEZE_ENABLE           0xca8
+#define PHB_ERR_AIB_FENCE_ENABLE        0xcb0
+#define PHB_ERR_LOG_0                   0xcc0
+#define PHB_ERR_LOG_1                   0xcc8
+#define PHB_ERR_STATUS_MASK             0xcd0
+#define PHB_ERR1_STATUS_MASK            0xcd8
+
+#define PHB_TXE_ERR_STATUS                      0xd00
+#define PHB_TXE_ERR1_STATUS                     0xd08
+#define PHB_TXE_ERR_INJECT                      0xd10
+#define PHB_TXE_ERR_LEM_ENABLE                  0xd18
+#define PHB_TXE_ERR_IRQ_ENABLE                  0xd20
+#define PHB_TXE_ERR_FREEZE_ENABLE               0xd28
+#define PHB_TXE_ERR_AIB_FENCE_ENABLE            0xd30
+#define PHB_TXE_ERR_LOG_0                       0xd40
+#define PHB_TXE_ERR_LOG_1                       0xd48
+#define PHB_TXE_ERR_STATUS_MASK                 0xd50
+#define PHB_TXE_ERR1_STATUS_MASK                0xd58
+
+#define PHB_RXE_ARB_ERR_STATUS                  0xd80
+#define PHB_RXE_ARB_ERR1_STATUS                 0xd88
+#define PHB_RXE_ARB_ERR_INJECT                  0xd90
+#define PHB_RXE_ARB_ERR_LEM_ENABLE              0xd98
+#define PHB_RXE_ARB_ERR_IRQ_ENABLE              0xda0
+#define PHB_RXE_ARB_ERR_FREEZE_ENABLE           0xda8
+#define PHB_RXE_ARB_ERR_AIB_FENCE_ENABLE        0xdb0
+#define PHB_RXE_ARB_ERR_LOG_0                   0xdc0
+#define PHB_RXE_ARB_ERR_LOG_1                   0xdc8
+#define PHB_RXE_ARB_ERR_STATUS_MASK             0xdd0
+#define PHB_RXE_ARB_ERR1_STATUS_MASK            0xdd8
+
+#define PHB_RXE_MRG_ERR_STATUS                  0xe00
+#define PHB_RXE_MRG_ERR1_STATUS                 0xe08
+#define PHB_RXE_MRG_ERR_INJECT                  0xe10
+#define PHB_RXE_MRG_ERR_LEM_ENABLE              0xe18
+#define PHB_RXE_MRG_ERR_IRQ_ENABLE              0xe20
+#define PHB_RXE_MRG_ERR_FREEZE_ENABLE           0xe28
+#define PHB_RXE_MRG_ERR_AIB_FENCE_ENABLE        0xe30
+#define PHB_RXE_MRG_ERR_LOG_0                   0xe40
+#define PHB_RXE_MRG_ERR_LOG_1                   0xe48
+#define PHB_RXE_MRG_ERR_STATUS_MASK             0xe50
+#define PHB_RXE_MRG_ERR1_STATUS_MASK            0xe58
+
+#define PHB_RXE_TCE_ERR_STATUS                  0xe80
+#define PHB_RXE_TCE_ERR1_STATUS                 0xe88
+#define PHB_RXE_TCE_ERR_INJECT                  0xe90
+#define PHB_RXE_TCE_ERR_LEM_ENABLE              0xe98
+#define PHB_RXE_TCE_ERR_IRQ_ENABLE              0xea0
+#define PHB_RXE_TCE_ERR_FREEZE_ENABLE           0xea8
+#define PHB_RXE_TCE_ERR_AIB_FENCE_ENABLE        0xeb0
+#define PHB_RXE_TCE_ERR_LOG_0                   0xec0
+#define PHB_RXE_TCE_ERR_LOG_1                   0xec8
+#define PHB_RXE_TCE_ERR_STATUS_MASK             0xed0
+#define PHB_RXE_TCE_ERR1_STATUS_MASK            0xed8
+
+/* Performance monitor & Debug registers */
+#define PHB_TRACE_CONTROL                       0xf80
+#define PHB_PERFMON_CONFIG                      0xf88
+#define PHB_PERFMON_CTR0                        0xf90
+#define PHB_PERFMON_CTR1                        0xf98
+#define PHB_PERFMON_CTR2                        0xfa0
+#define PHB_PERFMON_CTR3                        0xfa8
+
+/* Root complex config space memory mapped */
+#define PHB_RC_CONFIG_BASE                      0x1000
+#define   PHB_RC_CONFIG_SIZE                    0x800
+
+/* PHB4 REGB registers */
+
+/* PBL core */
+#define PHB_PBL_CONTROL                         0x1800
+#define PHB_PBL_TIMEOUT_CTRL                    0x1810
+#define PHB_PBL_NPTAG_ENABLE                    0x1820
+#define PHB_PBL_NBW_CMP_MASK                    0x1830
+#define   PHB_PBL_NBW_MASK_ENABLE               PPC_BIT(63)
+#define PHB_PBL_SYS_LINK_INIT                   0x1838
+#define PHB_PBL_BUF_STATUS                      0x1840
+#define PHB_PBL_ERR_STATUS                      0x1900
+#define PHB_PBL_ERR1_STATUS                     0x1908
+#define PHB_PBL_ERR_INJECT                      0x1910
+#define PHB_PBL_ERR_INF_ENABLE                  0x1920
+#define PHB_PBL_ERR_ERC_ENABLE                  0x1928
+#define PHB_PBL_ERR_FAT_ENABLE                  0x1930
+#define PHB_PBL_ERR_LOG_0                       0x1940
+#define PHB_PBL_ERR_LOG_1                       0x1948
+#define PHB_PBL_ERR_STATUS_MASK                 0x1950
+#define PHB_PBL_ERR1_STATUS_MASK                0x1958
+
+/* PCI-E stack */
+#define PHB_PCIE_SCR                    0x1A00
+#define   PHB_PCIE_SCR_SLOT_CAP         PPC_BIT(15)
+#define   PHB_PCIE_SCR_MAXLINKSPEED     PPC_BITMASK(32, 35)
+
+
+#define PHB_PCIE_CRESET                 0x1A10
+#define   PHB_PCIE_CRESET_CFG_CORE      PPC_BIT(0)
+#define   PHB_PCIE_CRESET_TLDLP         PPC_BIT(1)
+#define   PHB_PCIE_CRESET_PBL           PPC_BIT(2)
+#define   PHB_PCIE_CRESET_PERST_N       PPC_BIT(3)
+#define   PHB_PCIE_CRESET_PIPE_N        PPC_BIT(4)
+
+
+#define PHB_PCIE_HOTPLUG_STATUS         0x1A20
+#define   PHB_PCIE_HPSTAT_PRESENCE      PPC_BIT(10)
+
+#define PHB_PCIE_DLP_TRAIN_CTL          0x1A40
+#define   PHB_PCIE_DLP_LINK_WIDTH       PPC_BITMASK(30, 35)
+#define   PHB_PCIE_DLP_LINK_SPEED       PPC_BITMASK(36, 39)
+#define   PHB_PCIE_DLP_LTSSM_TRC        PPC_BITMASK(24, 27)
+#define     PHB_PCIE_DLP_LTSSM_RESET    0
+#define     PHB_PCIE_DLP_LTSSM_DETECT   1
+#define     PHB_PCIE_DLP_LTSSM_POLLING  2
+#define     PHB_PCIE_DLP_LTSSM_CONFIG   3
+#define     PHB_PCIE_DLP_LTSSM_L0       4
+#define     PHB_PCIE_DLP_LTSSM_REC      5
+#define     PHB_PCIE_DLP_LTSSM_L1       6
+#define     PHB_PCIE_DLP_LTSSM_L2       7
+#define     PHB_PCIE_DLP_LTSSM_HOTRESET 8
+#define     PHB_PCIE_DLP_LTSSM_DISABLED 9
+#define     PHB_PCIE_DLP_LTSSM_LOOPBACK 10
+#define   PHB_PCIE_DLP_TL_LINKACT       PPC_BIT(23)
+#define   PHB_PCIE_DLP_DL_PGRESET       PPC_BIT(22)
+#define   PHB_PCIE_DLP_TRAINING         PPC_BIT(20)
+#define   PHB_PCIE_DLP_INBAND_PRESENCE  PPC_BIT(19)
+
+#define PHB_PCIE_DLP_CTL                0x1A78
+#define   PHB_PCIE_DLP_CTL_BYPASS_PH2   PPC_BIT(4)
+#define   PHB_PCIE_DLP_CTL_BYPASS_PH3   PPC_BIT(5)
+
+#define PHB_PCIE_DLP_TRWCTL             0x1A80
+#define   PHB_PCIE_DLP_TRWCTL_EN        PPC_BIT(0)
+
+#define PHB_PCIE_DLP_ERRLOG1            0x1AA0
+#define PHB_PCIE_DLP_ERRLOG2            0x1AA8
+#define PHB_PCIE_DLP_ERR_STATUS         0x1AB0
+#define PHB_PCIE_DLP_ERR_COUNTERS       0x1AB8
+
+#define PHB_PCIE_LANE_EQ_CNTL0          0x1AD0
+#define PHB_PCIE_LANE_EQ_CNTL1          0x1AD8
+#define PHB_PCIE_LANE_EQ_CNTL2          0x1AE0
+#define PHB_PCIE_LANE_EQ_CNTL3          0x1AE8
+#define PHB_PCIE_LANE_EQ_CNTL20         0x1AF0
+#define PHB_PCIE_LANE_EQ_CNTL21         0x1AF8
+#define PHB_PCIE_LANE_EQ_CNTL22         0x1B00 /* DD1 only */
+#define PHB_PCIE_LANE_EQ_CNTL23         0x1B08 /* DD1 only */
+#define PHB_PCIE_TRACE_CTRL             0x1B20
+#define PHB_PCIE_MISC_STRAP             0x1B30
+
+/* Error */
+#define PHB_REGB_ERR_STATUS             0x1C00
+#define PHB_REGB_ERR1_STATUS            0x1C08
+#define PHB_REGB_ERR_INJECT             0x1C10
+#define PHB_REGB_ERR_INF_ENABLE         0x1C20
+#define PHB_REGB_ERR_ERC_ENABLE         0x1C28
+#define PHB_REGB_ERR_FAT_ENABLE         0x1C30
+#define PHB_REGB_ERR_LOG_0              0x1C40
+#define PHB_REGB_ERR_LOG_1              0x1C48
+#define PHB_REGB_ERR_STATUS_MASK        0x1C50
+#define PHB_REGB_ERR1_STATUS_MASK       0x1C58
+
+/*
+ * IODA3 on-chip tables
+ */
+
+#define IODA3_TBL_LIST          1
+#define IODA3_TBL_MIST          2
+#define IODA3_TBL_RCAM          5
+#define IODA3_TBL_MRT           6
+#define IODA3_TBL_PESTA         7
+#define IODA3_TBL_PESTB         8
+#define IODA3_TBL_TVT           9
+#define IODA3_TBL_TCR           10
+#define IODA3_TBL_TDR           11
+#define IODA3_TBL_MBT           16
+#define IODA3_TBL_MDT           17
+#define IODA3_TBL_PEEV          20
+
+/* LIST */
+#define IODA3_LIST_P                    PPC_BIT(6)
+#define IODA3_LIST_Q                    PPC_BIT(7)
+#define IODA3_LIST_STATE                PPC_BIT(14)
+
+/* MIST */
+#define IODA3_MIST_P3                   PPC_BIT(48 + 0)
+#define IODA3_MIST_Q3                   PPC_BIT(48 + 1)
+#define IODA3_MIST_PE3                  PPC_BITMASK(48 + 4, 48 + 15)
+
+/* TVT */
+#define IODA3_TVT_TABLE_ADDR            PPC_BITMASK(0, 47)
+#define IODA3_TVT_NUM_LEVELS            PPC_BITMASK(48, 50)
+#define   IODA3_TVE_1_LEVEL     0
+#define   IODA3_TVE_2_LEVELS    1
+#define   IODA3_TVE_3_LEVELS    2
+#define   IODA3_TVE_4_LEVELS    3
+#define   IODA3_TVE_5_LEVELS    4
+#define IODA3_TVT_TCE_TABLE_SIZE        PPC_BITMASK(51, 55)
+#define IODA3_TVT_NON_TRANSLATE_50      PPC_BIT(56)
+#define IODA3_TVT_IO_PSIZE              PPC_BITMASK(59, 63)
+
+/* PESTA */
+#define IODA3_PESTA_MMIO_FROZEN         PPC_BIT(0)
+#define IODA3_PESTA_TRANS_TYPE          PPC_BITMASK(5, 7)
+#define  IODA3_PESTA_TRANS_TYPE_MMIOLOAD 0x4
+#define IODA3_PESTA_CA_CMPLT_TMT        PPC_BIT(8)
+#define IODA3_PESTA_UR                  PPC_BIT(9)
+
+/* PESTB */
+#define IODA3_PESTB_DMA_STOPPED         PPC_BIT(0)
+
+/* MDT */
+/* FIXME: check this field with Eric and add a B, C and D */
+#define IODA3_MDT_PE_A                  PPC_BITMASK(0, 15)
+#define IODA3_MDT_PE_B                  PPC_BITMASK(16, 31)
+#define IODA3_MDT_PE_C                  PPC_BITMASK(32, 47)
+#define IODA3_MDT_PE_D                  PPC_BITMASK(48, 63)
+
+/* MBT */
+#define IODA3_MBT0_ENABLE               PPC_BIT(0)
+#define IODA3_MBT0_TYPE                 PPC_BIT(1)
+#define   IODA3_MBT0_TYPE_M32           IODA3_MBT0_TYPE
+#define   IODA3_MBT0_TYPE_M64           0
+#define IODA3_MBT0_MODE                 PPC_BITMASK(2, 3)
+#define   IODA3_MBT0_MODE_PE_SEG        0
+#define   IODA3_MBT0_MODE_MDT           1
+#define   IODA3_MBT0_MODE_SINGLE_PE     2
+#define IODA3_MBT0_SEG_DIV              PPC_BITMASK(4, 5)
+#define   IODA3_MBT0_SEG_DIV_MAX        0
+#define   IODA3_MBT0_SEG_DIV_128        1
+#define   IODA3_MBT0_SEG_DIV_64         2
+#define   IODA3_MBT0_SEG_DIV_8          3
+#define IODA3_MBT0_MDT_COLUMN           PPC_BITMASK(4, 5)
+#define IODA3_MBT0_BASE_ADDR            PPC_BITMASK(8, 51)
+
+#define IODA3_MBT1_ENABLE               PPC_BIT(0)
+#define IODA3_MBT1_MASK                 PPC_BITMASK(8, 51)
+#define IODA3_MBT1_SEG_BASE             PPC_BITMASK(55, 63)
+#define IODA3_MBT1_SINGLE_PE_NUM        PPC_BITMASK(55, 63)
+
+/*
+ * IODA3 in-memory tables
+ */
+
+/*
+ * PEST
+ *
+ * 2x8 bytes entries, PEST0 and PEST1
+ */
+
+#define IODA3_PEST0_MMIO_CAUSE          PPC_BIT(2)
+#define IODA3_PEST0_CFG_READ            PPC_BIT(3)
+#define IODA3_PEST0_CFG_WRITE           PPC_BIT(4)
+#define IODA3_PEST0_TTYPE               PPC_BITMASK(5, 7)
+#define   PEST_TTYPE_DMA_WRITE          0
+#define   PEST_TTYPE_MSI                1
+#define   PEST_TTYPE_DMA_READ           2
+#define   PEST_TTYPE_DMA_READ_RESP      3
+#define   PEST_TTYPE_MMIO_LOAD          4
+#define   PEST_TTYPE_MMIO_STORE         5
+#define   PEST_TTYPE_OTHER              7
+#define IODA3_PEST0_CA_RETURN           PPC_BIT(8)
+#define IODA3_PEST0_UR_RETURN           PPC_BIT(9)
+#define IODA3_PEST0_PCIE_NONFATAL       PPC_BIT(10)
+#define IODA3_PEST0_PCIE_FATAL          PPC_BIT(11)
+#define IODA3_PEST0_PARITY_UE           PPC_BIT(13)
+#define IODA3_PEST0_PCIE_CORRECTABLE    PPC_BIT(14)
+#define IODA3_PEST0_PCIE_INTERRUPT      PPC_BIT(15)
+#define IODA3_PEST0_MMIO_XLATE          PPC_BIT(16)
+#define IODA3_PEST0_IODA3_ERROR         PPC_BIT(16) /* Same bit as MMIO xlate */
+#define IODA3_PEST0_TCE_PAGE_FAULT      PPC_BIT(18)
+#define IODA3_PEST0_TCE_ACCESS_FAULT    PPC_BIT(19)
+#define IODA3_PEST0_DMA_RESP_TIMEOUT    PPC_BIT(20)
+#define IODA3_PEST0_AIB_SIZE_INVALID    PPC_BIT(21)
+#define IODA3_PEST0_LEM_BIT             PPC_BITMASK(26, 31)
+#define IODA3_PEST0_RID                 PPC_BITMASK(32, 47)
+#define IODA3_PEST0_MSI_DATA            PPC_BITMASK(48, 63)
+
+#define IODA3_PEST1_FAIL_ADDR           PPC_BITMASK(3, 63)
+
+
+#endif /* PCI_HOST_PNV_PHB4_REGS_H */
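The field masks in this header use IBM (big-endian) bit numbering, where bit 0
is the most significant bit of the 64-bit register.  A minimal standalone
sketch of that convention, assuming the PPC_BIT/PPC_BITMASK definitions QEMU
keeps in target/ppc/cpu.h (redeclared here so the example compiles on its own):

    #include <stdint.h>
    #include <stdio.h>

    /* Bit 'bit', counted from the MSB of a 64-bit register (IBM numbering) */
    #define PPC_BIT(bit)        (0x8000000000000000ULL >> (bit))
    /* Mask covering bits bs..be inclusive (bs <= be), IBM numbering */
    #define PPC_BITMASK(bs, be) ((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs))

    int main(void)
    {
        /* PHB_CA_BDFN above is PPC_BITMASK(4, 19): bits 4..19 from the MSB */
        printf("0x%016llx\n", (unsigned long long)PPC_BITMASK(4, 19));
        /* prints 0x0ffff00000000000 */
        return 0;
    }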
diff --git a/include/hw/pci/pcie_port.h b/include/hw/pci/pcie_port.h
index 7515430..4b3d254 100644
--- a/include/hw/pci/pcie_port.h
+++ b/include/hw/pci/pcie_port.h
@@ -72,6 +72,7 @@
 typedef struct PCIERootPortClass {
     PCIDeviceClass parent_class;
     DeviceRealize parent_realize;
+    DeviceReset parent_reset;
 
     uint8_t (*aer_vector)(const PCIDevice *dev);
     int (*interrupts_init)(PCIDevice *dev, Error **errp);
diff --git a/include/hw/ppc/pnv.h b/include/hw/ppc/pnv.h
index d65dd32..fb4d0c0 100644
--- a/include/hw/ppc/pnv.h
+++ b/include/hw/ppc/pnv.h
@@ -30,6 +30,8 @@
 #include "hw/ppc/pnv_homer.h"
 #include "hw/ppc/pnv_xive.h"
 #include "hw/ppc/pnv_core.h"
+#include "hw/pci-host/pnv_phb3.h"
+#include "hw/pci-host/pnv_phb4.h"
 
 #define TYPE_PNV_CHIP "pnv-chip"
 #define PNV_CHIP(obj) OBJECT_CHECK(PnvChip, (obj), TYPE_PNV_CHIP)
@@ -52,6 +54,8 @@
     uint64_t     cores_mask;
     PnvCore      **cores;
 
+    uint32_t     num_phbs;
+
     MemoryRegion xscom_mmio;
     MemoryRegion xscom;
     AddressSpace xscom_as;
@@ -74,6 +78,9 @@
     PnvOCC       occ;
     PnvHomer     homer;
 
+#define PNV8_CHIP_PHB3_MAX 4
+    PnvPHB3      phbs[PNV8_CHIP_PHB3_MAX];
+
     XICSFabric    *xics;
 } Pnv8Chip;
 
@@ -93,6 +100,9 @@
 
     uint32_t     nr_quads;
     PnvQuad      *quads;
+
+#define PNV9_CHIP_MAX_PEC 3
+    PnvPhb4PecState pecs[PNV9_CHIP_MAX_PEC];
 } Pnv9Chip;
 
 /*
@@ -120,6 +130,7 @@
     /*< public >*/
     uint64_t     chip_cfam_id;
     uint64_t     cores_mask;
+    uint32_t     num_phbs;
 
     DeviceRealize parent_realize;
 
@@ -217,6 +228,8 @@
     Notifier     powerdown_notifier;
 
     PnvPnor      *pnor;
+
+    hwaddr       fw_load_addr;
 };
 
 #define PNV_FDT_ADDR          0x01000000
diff --git a/include/hw/ppc/pnv_core.h b/include/hw/ppc/pnv_core.h
index 55eee95..113550e 100644
--- a/include/hw/ppc/pnv_core.h
+++ b/include/hw/ppc/pnv_core.h
@@ -40,6 +40,7 @@
     /*< public >*/
     PowerPCCPU **threads;
     uint32_t pir;
+    uint64_t hrmor;
     PnvChip *chip;
 
     MemoryRegion xscom_regs;
diff --git a/include/hw/ppc/pnv_xscom.h b/include/hw/ppc/pnv_xscom.h
index f74c81a..09156a5 100644
--- a/include/hw/ppc/pnv_xscom.h
+++ b/include/hw/ppc/pnv_xscom.h
@@ -71,6 +71,15 @@
 #define PNV_XSCOM_PBA_BASE        0x2013f00
 #define PNV_XSCOM_PBA_SIZE        0x40
 
+#define PNV_XSCOM_PBCQ_NEST_BASE  0x2012000
+#define PNV_XSCOM_PBCQ_NEST_SIZE  0x46
+
+#define PNV_XSCOM_PBCQ_PCI_BASE   0x9012000
+#define PNV_XSCOM_PBCQ_PCI_SIZE   0x15
+
+#define PNV_XSCOM_PBCQ_SPCI_BASE  0x9013c00
+#define PNV_XSCOM_PBCQ_SPCI_SIZE  0x5
+
 /*
  * Layout of the XSCOM PCB addresses (POWER 9)
  */
@@ -94,6 +103,17 @@
 #define PNV9_XSCOM_XIVE_BASE      0x5013000
 #define PNV9_XSCOM_XIVE_SIZE      0x300
 
+#define PNV9_XSCOM_PEC_NEST_BASE  0x4010c00
+#define PNV9_XSCOM_PEC_NEST_SIZE  0x100
+
+#define PNV9_XSCOM_PEC_PCI_BASE   0xd010800
+#define PNV9_XSCOM_PEC_PCI_SIZE   0x200
+
+/* XSCOM PCI "pass-through" window to PHB SCOM */
+#define PNV9_XSCOM_PEC_PCI_STK0   0x100
+#define PNV9_XSCOM_PEC_PCI_STK1   0x140
+#define PNV9_XSCOM_PEC_PCI_STK2   0x180
+
 /*
  * Layout of the XSCOM PCB addresses (POWER 10)
  */
diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h
index 4ea5436..93e614c 100644
--- a/include/hw/ppc/ppc.h
+++ b/include/hw/ppc/ppc.h
@@ -68,7 +68,6 @@
 void ppc40x_core_reset(PowerPCCPU *cpu);
 void ppc40x_chip_reset(PowerPCCPU *cpu);
 void ppc40x_system_reset(PowerPCCPU *cpu);
-void PPC_debug_write (void *opaque, uint32_t addr, uint32_t val);
 
 #if defined(CONFIG_USER_ONLY)
 static inline void ppc40x_irq_init(PowerPCCPU *cpu) {}
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 61f005c..a1fba95 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -79,8 +79,10 @@
 #define SPAPR_CAP_LARGE_DECREMENTER     0x08
 /* Count Cache Flush Assist HW Instruction */
 #define SPAPR_CAP_CCF_ASSIST            0x09
+/* FWNMI machine check handling */
+#define SPAPR_CAP_FWNMI_MCE             0x0A
 /* Num Caps */
-#define SPAPR_CAP_NUM                   (SPAPR_CAP_CCF_ASSIST + 1)
+#define SPAPR_CAP_NUM                   (SPAPR_CAP_FWNMI_MCE + 1)
 
 /*
  * Capability Values
@@ -189,6 +191,15 @@
      * occurs during the unplug process. */
     QTAILQ_HEAD(, SpaprDimmState) pending_dimm_unplugs;
 
+    /* State related to "ibm,nmi-register" and "ibm,nmi-interlock" calls */
+    target_ulong guest_machine_check_addr;
+    /*
+     * mc_status is set to -1 if mc is not in progress, else is set to the CPU
+     * handling the mc.
+     */
+    int mc_status;
+    QemuCond mc_delivery_cond;
+
     /*< public >*/
     char *kvm_type;
     char *host_model;
@@ -207,6 +218,8 @@
 
     unsigned gpu_numa_id;
     SpaprTpmProxy *tpm_proxy;
+
+    Error *fwnmi_migration_blocker;
 };
 
 #define H_SUCCESS         0
@@ -645,8 +658,10 @@
 #define RTAS_IBM_REMOVE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x28)
 #define RTAS_IBM_RESET_PE_DMA_WINDOW            (RTAS_TOKEN_BASE + 0x29)
 #define RTAS_IBM_SUSPEND_ME                     (RTAS_TOKEN_BASE + 0x2A)
+#define RTAS_IBM_NMI_REGISTER                   (RTAS_TOKEN_BASE + 0x2B)
+#define RTAS_IBM_NMI_INTERLOCK                  (RTAS_TOKEN_BASE + 0x2C)
 
-#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2B)
+#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2D)
 
 /* RTAS ibm,get-system-parameter token values */
 #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS      20
@@ -716,6 +731,9 @@
 
 #define RTAS_ERROR_LOG_MAX      2048
 
+/* Offset from rtas-base where error log is placed */
+#define RTAS_ERROR_LOG_OFFSET       0x30
+
 #define RTAS_EVENT_SCAN_RATE    1
 
 /* This helper should be used to encode interrupt specifiers when the related
@@ -802,6 +820,7 @@
 int spapr_max_server_number(SpaprMachineState *spapr);
 void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
                       uint64_t pte0, uint64_t pte1);
+void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered);
 
 /* DRC callbacks. */
 void spapr_core_release(DeviceState *dev);
@@ -869,6 +888,7 @@
 extern const VMStateDescription vmstate_spapr_cap_nested_kvm_hv;
 extern const VMStateDescription vmstate_spapr_cap_large_decr;
 extern const VMStateDescription vmstate_spapr_cap_ccf_assist;
+extern const VMStateDescription vmstate_spapr_cap_fwnmi;
 
 static inline uint8_t spapr_get_cap(SpaprMachineState *spapr, int cap)
 {
@@ -891,4 +911,5 @@
 #define SPAPR_OV5_XIVE_BOTH     0x80 /* Only to advertise on the platform */
 
 void spapr_set_all_lpcrs(target_ulong value, target_ulong mask);
+hwaddr spapr_get_rtas_addr(void);
 #endif /* HW_SPAPR_H */
diff --git a/include/hw/ppc/spapr_vio.h b/include/hw/ppc/spapr_vio.h
index ce6d9b0..bed7df6 100644
--- a/include/hw/ppc/spapr_vio.h
+++ b/include/hw/ppc/spapr_vio.h
@@ -58,6 +58,7 @@
     void (*realize)(SpaprVioDevice *dev, Error **errp);
     void (*reset)(SpaprVioDevice *dev);
     int (*devnode)(SpaprVioDevice *dev, void *fdt, int node_off);
+    const char *(*get_dt_compatible)(SpaprVioDevice *dev);
 } SpaprVioDeviceClass;
 
 struct SpaprVioDevice {
diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
index 48a75aa..9ed58ec 100644
--- a/include/hw/ppc/xics.h
+++ b/include/hw/ppc/xics.h
@@ -101,6 +101,10 @@
     DeviceClass parent_class;
 
     DeviceRealize parent_realize;
+    DeviceReset parent_reset;
+
+    void (*reject)(ICSState *s, uint32_t irq);
+    void (*resend)(ICSState *s);
 };
 
 struct ICSState {
@@ -161,6 +165,7 @@
 uint32_t icp_accept(ICPState *ss);
 uint32_t icp_ipoll(ICPState *ss, uint32_t *mfrr);
 void icp_eoi(ICPState *icp, uint32_t xirr);
+void icp_irq(ICSState *ics, int server, int nr, uint8_t priority);
 void icp_reset(ICPState *icp);
 
 void ics_write_xive(ICSState *ics, int nr, int server,
diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index f6ba78e..a6d20b0 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -295,6 +295,14 @@
  */
 void qemu_mutex_unlock_iothread(void);
 
+/*
+ * qemu_cond_wait_iothread: Wait on condition for the main loop mutex
+ *
+ * This function atomically releases the main loop mutex and causes
+ * the calling thread to block on the condition.
+ */
+void qemu_cond_wait_iothread(QemuCond *cond);
+
 /* internal interfaces */
 
 void qemu_fd_register(int fd);
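The helper declared above is what the FWNMI machine check path relies on: a
vCPU that wants to deliver a machine check must wait until mc_status (see the
SpaprMachineState hunk earlier in this diff) reports that no other delivery is
in progress, without holding the main loop mutex across the wait.  A minimal
usage sketch under those assumptions -- wait_and_claim_mc is a hypothetical
name and vcpu_id is assumed to be a usable CPU identifier; the real handler is
not shown in this excerpt:

    /*
     * Sketch only: block until no other vCPU is delivering a machine
     * check, then claim delivery for this one.  Assumes the BQL (main
     * loop mutex) is already held on entry.
     */
    static void wait_and_claim_mc(SpaprMachineState *spapr, PowerPCCPU *cpu)
    {
        while (spapr->mc_status != -1) {
            /* Drops the BQL while blocked, re-acquires it before returning */
            qemu_cond_wait_iothread(&spapr->mc_delivery_cond);
        }
        spapr->mc_status = cpu->vcpu_id;
    }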
diff --git a/include/sysemu/tpm.h b/include/sysemu/tpm.h
index 5b541a7..15979a3 100644
--- a/include/sysemu/tpm.h
+++ b/include/sysemu/tpm.h
@@ -45,11 +45,14 @@
 
 #define TYPE_TPM_TIS                "tpm-tis"
 #define TYPE_TPM_CRB                "tpm-crb"
+#define TYPE_TPM_SPAPR              "tpm-spapr"
 
 #define TPM_IS_TIS(chr)                             \
     object_dynamic_cast(OBJECT(chr), TYPE_TPM_TIS)
 #define TPM_IS_CRB(chr)                             \
     object_dynamic_cast(OBJECT(chr), TYPE_TPM_CRB)
+#define TPM_IS_SPAPR(chr)                           \
+    object_dynamic_cast(OBJECT(chr), TYPE_TPM_SPAPR)
 
 /* returns NULL unless there is exactly one TPM device */
 static inline TPMIf *tpm_find(void)
diff --git a/pc-bios/README b/pc-bios/README
index 269d99a..d6d33d2 100644
--- a/pc-bios/README
+++ b/pc-bios/README
@@ -4,9 +4,6 @@
 - The VGA BIOS and the Cirrus VGA BIOS come from the LGPL VGA bios
   project (http://www.nongnu.org/vgabios/).
 
-- The PowerPC Open Hack'Ware Open Firmware Compatible BIOS is
-  available at https://repo.or.cz/openhackware.git.
-
 - OpenBIOS (http://www.openbios.org/) is a free (GPL v2) portable
   firmware implementation. The goal is to implement a 100% IEEE
   1275-1994 (referred to as Open Firmware) compliant firmware.
diff --git a/pc-bios/ppc_rom.bin b/pc-bios/ppc_rom.bin
deleted file mode 100644
index 174a247..0000000
--- a/pc-bios/ppc_rom.bin
+++ /dev/null
Binary files differ
diff --git a/qapi/tpm.json b/qapi/tpm.json
index b30323b..63878aa 100644
--- a/qapi/tpm.json
+++ b/qapi/tpm.json
@@ -12,11 +12,11 @@
 #
 # @tpm-tis: TPM TIS model
 # @tpm-crb: TPM CRB model (since 2.12)
+# @tpm-spapr: TPM SPAPR model (since 5.0)
 #
 # Since: 1.5
 ##
-{ 'enum': 'TpmModel', 'data': [ 'tpm-tis', 'tpm-crb' ] }
-
+{ 'enum': 'TpmModel', 'data': [ 'tpm-tis', 'tpm-crb', 'tpm-spapr' ] }
 ##
 # @query-tpm-models:
 #
@@ -29,7 +29,7 @@
 # Example:
 #
 # -> { "execute": "query-tpm-models" }
-# <- { "return": [ "tpm-tis", "tpm-crb" ] }
+# <- { "return": [ "tpm-tis", "tpm-crb", "tpm-spapr" ] }
 #
 ##
 { 'command': 'query-tpm-models', 'returns': ['TpmModel'] }
diff --git a/qemu-deprecated.texi b/qemu-deprecated.texi
index 3d2a8ff..ea3e10b 100644
--- a/qemu-deprecated.texi
+++ b/qemu-deprecated.texi
@@ -270,12 +270,6 @@
 These machine types are very old and likely can not be used for live migration
 from old QEMU versions anymore. A newer machine type should be used instead.
 
-@subsection prep (PowerPC) (since 3.1)
-
-This machine type uses an unmaintained firmware, broken in lots of ways,
-and unable to start post-2004 operating systems. 40p machine type should be
-used instead.
-
 @subsection spike_v1.9.1 and spike_v1.10 (since 4.1)
 
 The version specific Spike machines have been deprecated in favour of the
diff --git a/qemu-doc.texi b/qemu-doc.texi
index 2328e7e..b79f1c3 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -1729,7 +1729,7 @@
 @section PowerPC System emulator
 @cindex system emulation (PowerPC)
 
-Use the executable @file{qemu-system-ppc} to simulate a complete PREP
+Use the executable @file{qemu-system-ppc} to simulate a complete 40P (PREP)
 or PowerMac PowerPC system.
 
 QEMU emulates the following PowerMac peripherals:
@@ -1749,7 +1749,7 @@
 VIA-CUDA with ADB keyboard and mouse.
 @end itemize
 
-QEMU emulates the following PREP peripherals:
+QEMU emulates the following 40P (PREP) peripherals:
 
 @itemize @minus
 @item
@@ -1761,7 +1761,7 @@
 @item
 Floppy disk
 @item
-NE2000 network adapters
+PCnet network adapters
 @item
 Serial port
 @item
@@ -1770,12 +1770,9 @@
 PC compatible keyboard and mouse.
 @end itemize
 
-QEMU uses the Open Hack'Ware Open Firmware Compatible BIOS available at
-@url{http://perso.magic.fr/l_indien/OpenHackWare/index.htm}.
-
 Since version 0.9.1, QEMU uses OpenBIOS @url{https://www.openbios.org/}
-for the g3beige and mac99 PowerMac machines. OpenBIOS is a free (GPL
-v2) portable firmware implementation. The goal is to implement a 100%
+for the g3beige and mac99 PowerMac and the 40p machines. OpenBIOS is a free
+(GPL v2) portable firmware implementation. The goal is to implement a 100%
 IEEE 1275-1994 (referred to as Open Firmware) compliant firmware.
 
 @c man begin OPTIONS
@@ -1798,8 +1795,6 @@
  -prom-env 'boot-args=conf=hd:2,\yaboot.conf'
 @end example
 
-These variables are not used by Open Hack'Ware.
-
 @end table
 
 @c man end
diff --git a/roms/openhackware b/roms/openhackware
deleted file mode 160000
index c559da7..0000000
--- a/roms/openhackware
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit c559da7c8eec5e45ef1f67978827af6f0b9546f5
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 8ebeaba..3a1eb76 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -180,7 +180,7 @@
     POWERPC_EXCP_TRAP          = 0x40,
 };
 
-#define PPC_INPUT(env) (env->bus_model)
+#define PPC_INPUT(env) ((env)->bus_model)
 
 /*****************************************************************************/
 typedef struct opc_handler_t opc_handler_t;
@@ -397,6 +397,10 @@
 #define PSSCR_ESL         PPC_BIT(42) /* Enable State Loss */
 #define PSSCR_EC          PPC_BIT(43) /* Exit Criterion */
 
+/* HFSCR bits */
+#define HFSCR_MSGP     PPC_BIT(53) /* Privileged Message Send Facilities */
+#define HFSCR_IC_MSGP  0xA
+
 #define msr_sf   ((env->msr >> MSR_SF)   & 1)
 #define msr_isf  ((env->msr >> MSR_ISF)  & 1)
 #define msr_shv  ((env->msr >> MSR_SHV)  & 1)
@@ -1329,6 +1333,8 @@
 #endif
 
 void store_fpscr(CPUPPCState *env, uint64_t arg, uint32_t mask);
+void helper_hfscr_facility_check(CPUPPCState *env, uint32_t bit,
+                                 const char *caller, uint32_t cause);
 
 static inline uint64_t ppc_dump_gpr(CPUPPCState *env, int gprn)
 {
diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 5752ed4..027f54c 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -473,6 +473,15 @@
         env->spr[SPR_FSCR] |= ((target_ulong)env->error_code << 56);
 #endif
         break;
+    case POWERPC_EXCP_HV_FU:     /* Hypervisor Facility Unavailable Exception */
+#ifdef TARGET_PPC64
+        env->spr[SPR_HFSCR] |= ((target_ulong)env->error_code << FSCR_IC_POS);
+        srr0 = SPR_HSRR0;
+        srr1 = SPR_HSRR1;
+        new_msr |= (target_ulong)MSR_HVB;
+        new_msr |= env->msr & ((target_ulong)1 << MSR_RI);
+#endif
+        break;
     case POWERPC_EXCP_PIT:       /* Programmable interval timer interrupt    */
         LOG_EXCP("PIT exception\n");
         break;
@@ -900,7 +909,11 @@
         }
         if (env->pending_interrupts & (1 << PPC_INTERRUPT_DOORBELL)) {
             env->pending_interrupts &= ~(1 << PPC_INTERRUPT_DOORBELL);
-            powerpc_excp(cpu, env->excp_model, POWERPC_EXCP_DOORI);
+            if (is_book3s_arch2x(env)) {
+                powerpc_excp(cpu, env->excp_model, POWERPC_EXCP_SDOOR);
+            } else {
+                powerpc_excp(cpu, env->excp_model, POWERPC_EXCP_DOORI);
+            }
             return;
         }
         if (env->pending_interrupts & (1 << PPC_INTERRUPT_HDOORBELL)) {
@@ -1221,39 +1234,30 @@
 }
 
 /* Server Processor Control */
-static int book3s_dbell2irq(target_ulong rb)
-{
-    int msg = rb & DBELL_TYPE_MASK;
 
+static bool dbell_type_server(target_ulong rb)
+{
     /*
      * A Directed Hypervisor Doorbell message is sent only if the
      * message type is 5. All other types are reserved and the
      * instruction is a no-op
      */
-    return msg == DBELL_TYPE_DBELL_SERVER ? PPC_INTERRUPT_HDOORBELL : -1;
+    return (rb & DBELL_TYPE_MASK) == DBELL_TYPE_DBELL_SERVER;
 }
 
 void helper_book3s_msgclr(CPUPPCState *env, target_ulong rb)
 {
-    int irq = book3s_dbell2irq(rb);
-
-    if (irq < 0) {
+    if (!dbell_type_server(rb)) {
         return;
     }
 
-    env->pending_interrupts &= ~(1 << irq);
+    env->pending_interrupts &= ~(1 << PPC_INTERRUPT_HDOORBELL);
 }
 
-void helper_book3s_msgsnd(target_ulong rb)
+static void book3s_msgsnd_common(int pir, int irq)
 {
-    int irq = book3s_dbell2irq(rb);
-    int pir = rb & DBELL_PROCIDTAG_MASK;
     CPUState *cs;
 
-    if (irq < 0) {
-        return;
-    }
-
     qemu_mutex_lock_iothread();
     CPU_FOREACH(cs) {
         PowerPCCPU *cpu = POWERPC_CPU(cs);
@@ -1267,6 +1271,49 @@
     }
     qemu_mutex_unlock_iothread();
 }
+
+void helper_book3s_msgsnd(target_ulong rb)
+{
+    int pir = rb & DBELL_PROCIDTAG_MASK;
+
+    if (!dbell_type_server(rb)) {
+        return;
+    }
+
+    book3s_msgsnd_common(pir, PPC_INTERRUPT_HDOORBELL);
+}
+
+#if defined(TARGET_PPC64)
+void helper_book3s_msgclrp(CPUPPCState *env, target_ulong rb)
+{
+    helper_hfscr_facility_check(env, HFSCR_MSGP, "msgclrp", HFSCR_IC_MSGP);
+
+    if (!dbell_type_server(rb)) {
+        return;
+    }
+
+    env->pending_interrupts &= ~(1 << PPC_INTERRUPT_DOORBELL);
+}
+
+/*
+ * sends a message to other threads that are on the same
+ * multi-threaded processor
+ */
+void helper_book3s_msgsndp(CPUPPCState *env, target_ulong rb)
+{
+    int pir = env->spr_cb[SPR_PIR].default_value;
+
+    helper_hfscr_facility_check(env, HFSCR_MSGP, "msgsndp", HFSCR_IC_MSGP);
+
+    if (!dbell_type_server(rb)) {
+        return;
+    }
+
+    /* TODO: TCG supports only one thread */
+
+    book3s_msgsnd_common(pir, PPC_INTERRUPT_DOORBELL);
+}
+#endif
 #endif
 
 void ppc_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr,
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index cd0dfe3..cfb4c07 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -657,6 +657,10 @@
 DEF_HELPER_FLAGS_1(load_purr, TCG_CALL_NO_RWG, tl, env)
 DEF_HELPER_FLAGS_2(store_purr, TCG_CALL_NO_RWG, void, env, tl)
 DEF_HELPER_2(store_ptcr, void, env, tl)
+DEF_HELPER_FLAGS_1(load_dpdes, TCG_CALL_NO_RWG, tl, env)
+DEF_HELPER_FLAGS_2(store_dpdes, TCG_CALL_NO_RWG, void, env, tl)
+DEF_HELPER_2(book3s_msgsndp, void, env, tl)
+DEF_HELPER_2(book3s_msgclrp, void, env, tl)
 #endif
 DEF_HELPER_2(store_sdr1, void, env, tl)
 DEF_HELPER_2(store_pidr, void, env, tl)
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 06fd0cc..7f44b1a 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -53,6 +53,9 @@
 
 #define PROC_DEVTREE_CPU      "/proc/device-tree/cpus/"
 
+#define DEBUG_RETURN_GUEST 0
+#define DEBUG_RETURN_GDB   1
+
 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
     KVM_CAP_LAST_INFO
 };
@@ -1564,7 +1567,7 @@
 static int kvm_handle_hw_breakpoint(CPUState *cs,
                                     struct kvm_debug_exit_arch *arch_info)
 {
-    int handle = 0;
+    int handle = DEBUG_RETURN_GUEST;
     int n;
     int flag = 0;
 
@@ -1572,13 +1575,13 @@
         if (arch_info->status & KVMPPC_DEBUG_BREAKPOINT) {
             n = find_hw_breakpoint(arch_info->address, GDB_BREAKPOINT_HW);
             if (n >= 0) {
-                handle = 1;
+                handle = DEBUG_RETURN_GDB;
             }
         } else if (arch_info->status & (KVMPPC_DEBUG_WATCH_READ |
                                         KVMPPC_DEBUG_WATCH_WRITE)) {
             n = find_hw_watchpoint(arch_info->address,  &flag);
             if (n >= 0) {
-                handle = 1;
+                handle = DEBUG_RETURN_GDB;
                 cs->watchpoint_hit = &hw_watchpoint;
                 hw_watchpoint.vaddr = hw_debug_points[n].addr;
                 hw_watchpoint.flags = flag;
@@ -1590,12 +1593,12 @@
 
 static int kvm_handle_singlestep(void)
 {
-    return 1;
+    return DEBUG_RETURN_GDB;
 }
 
 static int kvm_handle_sw_breakpoint(void)
 {
-    return 1;
+    return DEBUG_RETURN_GDB;
 }
 
 static int kvm_handle_debug(PowerPCCPU *cpu, struct kvm_run *run)
@@ -1647,7 +1650,7 @@
     env->error_code = POWERPC_EXCP_INVAL;
     ppc_cpu_do_interrupt(cs);
 
-    return 0;
+    return DEBUG_RETURN_GUEST;
 }
 
 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
@@ -1702,6 +1705,13 @@
         ret = 0;
         break;
 
+#if defined(TARGET_PPC64)
+    case KVM_EXIT_NMI:
+        trace_kvm_handle_nmi_exception();
+        ret = kvm_handle_nmi(cpu, run);
+        break;
+#endif
+
     default:
         fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
         ret = -1;
@@ -2054,6 +2064,14 @@
     }
 }
 
+int kvmppc_set_fwnmi(void)
+{
+    PowerPCCPU *cpu = POWERPC_CPU(first_cpu);
+    CPUState *cs = CPU(cpu);
+
+    return kvm_vcpu_enable_cap(cs, KVM_CAP_PPC_FWNMI, 0);
+}
+
 int kvmppc_smt_threads(void)
 {
     return cap_ppc_smt ? cap_ppc_smt : 1;
@@ -2789,6 +2807,19 @@
     return data & 0xffff;
 }
 
+#if defined(TARGET_PPC64)
+int kvm_handle_nmi(PowerPCCPU *cpu, struct kvm_run *run)
+{
+    bool recovered = run->flags & KVM_RUN_PPC_NMI_DISP_FULLY_RECOV;
+
+    cpu_synchronize_state(CPU(cpu));
+
+    spapr_mce_req_event(cpu, recovered);
+
+    return 0;
+}
+#endif
+
 int kvmppc_enable_hwrng(void)
 {
     if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_PPC_HWRNG)) {
diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index b713097..9e4f235 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -27,6 +27,7 @@
 void kvmppc_set_papr(PowerPCCPU *cpu);
 int kvmppc_set_compat(PowerPCCPU *cpu, uint32_t compat_pvr);
 void kvmppc_set_mpic_proxy(PowerPCCPU *cpu, int mpic_proxy);
+int kvmppc_set_fwnmi(void);
 int kvmppc_smt_threads(void);
 void kvmppc_error_append_smt_possible_hint(Error *const *errp);
 int kvmppc_set_smt_threads(int smt);
@@ -83,6 +84,8 @@
 void kvmppc_set_reg_ppc_online(PowerPCCPU *cpu, unsigned int online);
 void kvmppc_set_reg_tb_offset(PowerPCCPU *cpu, int64_t tb_offset);
 
+int kvm_handle_nmi(PowerPCCPU *cpu, struct kvm_run *run);
+
 #else
 
 static inline uint32_t kvmppc_get_tbfreq(void)
@@ -160,6 +163,11 @@
 {
 }
 
+static inline int kvmppc_set_fwnmi(void)
+{
+    return -1;
+}
+
 static inline int kvmppc_smt_threads(void)
 {
     return 1;
diff --git a/target/ppc/mem_helper.c b/target/ppc/mem_helper.c
index e8e2a8a..98f5895 100644
--- a/target/ppc/mem_helper.c
+++ b/target/ppc/mem_helper.c
@@ -56,51 +56,138 @@
     }
 }
 
+static void *probe_contiguous(CPUPPCState *env, target_ulong addr, uint32_t nb,
+                              MMUAccessType access_type, int mmu_idx,
+                              uintptr_t raddr)
+{
+    void *host1, *host2;
+    uint32_t nb_pg1, nb_pg2;
+
+    nb_pg1 = -(addr | TARGET_PAGE_MASK);
+    if (likely(nb <= nb_pg1)) {
+        /* The entire operation is on a single page.  */
+        return probe_access(env, addr, nb, access_type, mmu_idx, raddr);
+    }
+
+    /* The operation spans two pages.  */
+    nb_pg2 = nb - nb_pg1;
+    host1 = probe_access(env, addr, nb_pg1, access_type, mmu_idx, raddr);
+    addr = addr_add(env, addr, nb_pg1);
+    host2 = probe_access(env, addr, nb_pg2, access_type, mmu_idx, raddr);
+
+    /* If the two host pages are contiguous, optimize.  */
+    if (host2 == host1 + nb_pg1) {
+        return host1;
+    }
+    return NULL;
+}
+
 void helper_lmw(CPUPPCState *env, target_ulong addr, uint32_t reg)
 {
-    for (; reg < 32; reg++) {
-        if (needs_byteswap(env)) {
-            env->gpr[reg] = bswap32(cpu_ldl_data_ra(env, addr, GETPC()));
-        } else {
-            env->gpr[reg] = cpu_ldl_data_ra(env, addr, GETPC());
+    uintptr_t raddr = GETPC();
+    int mmu_idx = cpu_mmu_index(env, false);
+    void *host = probe_contiguous(env, addr, (32 - reg) * 4,
+                                  MMU_DATA_LOAD, mmu_idx, raddr);
+
+    if (likely(host)) {
+        /* Fast path -- the entire operation is in RAM at host.  */
+        for (; reg < 32; reg++) {
+            env->gpr[reg] = (uint32_t)ldl_be_p(host);
+            host += 4;
         }
-        addr = addr_add(env, addr, 4);
+    } else {
+        /* Slow path -- at least some of the operation requires i/o.  */
+        for (; reg < 32; reg++) {
+            env->gpr[reg] = cpu_ldl_mmuidx_ra(env, addr, mmu_idx, raddr);
+            addr = addr_add(env, addr, 4);
+        }
     }
 }
 
 void helper_stmw(CPUPPCState *env, target_ulong addr, uint32_t reg)
 {
-    for (; reg < 32; reg++) {
-        if (needs_byteswap(env)) {
-            cpu_stl_data_ra(env, addr, bswap32((uint32_t)env->gpr[reg]),
-                                                   GETPC());
-        } else {
-            cpu_stl_data_ra(env, addr, (uint32_t)env->gpr[reg], GETPC());
+    uintptr_t raddr = GETPC();
+    int mmu_idx = cpu_mmu_index(env, false);
+    void *host = probe_contiguous(env, addr, (32 - reg) * 4,
+                                  MMU_DATA_STORE, mmu_idx, raddr);
+
+    if (likely(host)) {
+        /* Fast path -- the entire operation is in RAM at host.  */
+        for (; reg < 32; reg++) {
+            stl_be_p(host, env->gpr[reg]);
+            host += 4;
         }
-        addr = addr_add(env, addr, 4);
+    } else {
+        /* Slow path -- at least some of the operation requires i/o.  */
+        for (; reg < 32; reg++) {
+            cpu_stl_mmuidx_ra(env, addr, env->gpr[reg], mmu_idx, raddr);
+            addr = addr_add(env, addr, 4);
+        }
     }
 }
 
 static void do_lsw(CPUPPCState *env, target_ulong addr, uint32_t nb,
                    uint32_t reg, uintptr_t raddr)
 {
-    int sh;
+    int mmu_idx;
+    void *host;
+    uint32_t val;
 
-    for (; nb > 3; nb -= 4) {
-        env->gpr[reg] = cpu_ldl_data_ra(env, addr, raddr);
-        reg = (reg + 1) % 32;
-        addr = addr_add(env, addr, 4);
+    if (unlikely(nb == 0)) {
+        return;
     }
-    if (unlikely(nb > 0)) {
-        env->gpr[reg] = 0;
-        for (sh = 24; nb > 0; nb--, sh -= 8) {
-            env->gpr[reg] |= cpu_ldub_data_ra(env, addr, raddr) << sh;
-            addr = addr_add(env, addr, 1);
+
+    mmu_idx = cpu_mmu_index(env, false);
+    host = probe_contiguous(env, addr, nb, MMU_DATA_LOAD, mmu_idx, raddr);
+
+    if (likely(host)) {
+        /* Fast path -- the entire operation is in RAM at host.  */
+        for (; nb > 3; nb -= 4) {
+            env->gpr[reg] = (uint32_t)ldl_be_p(host);
+            reg = (reg + 1) % 32;
+            host += 4;
+        }
+        switch (nb) {
+        default:
+            return;
+        case 1:
+            val = ldub_p(host) << 24;
+            break;
+        case 2:
+            val = lduw_be_p(host) << 16;
+            break;
+        case 3:
+            val = (lduw_be_p(host) << 16) | (ldub_p(host + 2) << 8);
+            break;
+        }
+    } else {
+        /* Slow path -- at least some of the operation requires i/o.  */
+        for (; nb > 3; nb -= 4) {
+            env->gpr[reg] = cpu_ldl_mmuidx_ra(env, addr, mmu_idx, raddr);
+            reg = (reg + 1) % 32;
+            addr = addr_add(env, addr, 4);
+        }
+        switch (nb) {
+        default:
+            return;
+        case 1:
+            val = cpu_ldub_mmuidx_ra(env, addr, mmu_idx, raddr) << 24;
+            break;
+        case 2:
+            val = cpu_lduw_mmuidx_ra(env, addr, mmu_idx, raddr) << 16;
+            break;
+        case 3:
+            val = cpu_lduw_mmuidx_ra(env, addr, mmu_idx, raddr) << 16;
+            addr = addr_add(env, addr, 2);
+            val |= cpu_ldub_mmuidx_ra(env, addr, mmu_idx, raddr) << 8;
+            break;
         }
     }
+    env->gpr[reg] = val;
 }
 
-void helper_lsw(CPUPPCState *env, target_ulong addr, uint32_t nb, uint32_t reg)
+void helper_lsw(CPUPPCState *env, target_ulong addr,
+                uint32_t nb, uint32_t reg)
 {
     do_lsw(env, addr, nb, reg, GETPC());
 }
@@ -130,17 +217,57 @@
 void helper_stsw(CPUPPCState *env, target_ulong addr, uint32_t nb,
                  uint32_t reg)
 {
-    int sh;
+    uintptr_t raddr = GETPC();
+    int mmu_idx;
+    void *host;
+    uint32_t val;
 
-    for (; nb > 3; nb -= 4) {
-        cpu_stl_data_ra(env, addr, env->gpr[reg], GETPC());
-        reg = (reg + 1) % 32;
-        addr = addr_add(env, addr, 4);
+    if (unlikely(nb == 0)) {
+        return;
     }
-    if (unlikely(nb > 0)) {
-        for (sh = 24; nb > 0; nb--, sh -= 8) {
-            cpu_stb_data_ra(env, addr, (env->gpr[reg] >> sh) & 0xFF, GETPC());
-            addr = addr_add(env, addr, 1);
+
+    mmu_idx = cpu_mmu_index(env, false);
+    host = probe_contiguous(env, addr, nb, MMU_DATA_STORE, mmu_idx, raddr);
+
+    if (likely(host)) {
+        /* Fast path -- the entire operation is in RAM at host.  */
+        for (; nb > 3; nb -= 4) {
+            stl_be_p(host, env->gpr[reg]);
+            reg = (reg + 1) % 32;
+            host += 4;
+        }
+        val = env->gpr[reg];
+        switch (nb) {
+        case 1:
+            stb_p(host, val >> 24);
+            break;
+        case 2:
+            stw_be_p(host, val >> 16);
+            break;
+        case 3:
+            stw_be_p(host, val >> 16);
+            stb_p(host + 2, val >> 8);
+            break;
+        }
+    } else {
+        for (; nb > 3; nb -= 4) {
+            cpu_stl_mmuidx_ra(env, addr, env->gpr[reg], mmu_idx, raddr);
+            reg = (reg + 1) % 32;
+            addr = addr_add(env, addr, 4);
+        }
+        val = env->gpr[reg];
+        switch (nb) {
+        case 1:
+            cpu_stb_mmuidx_ra(env, addr, val >> 24, mmu_idx, raddr);
+            break;
+        case 2:
+            cpu_stw_mmuidx_ra(env, addr, val >> 16, mmu_idx, raddr);
+            break;
+        case 3:
+            cpu_stw_mmuidx_ra(env, addr, val >> 16, mmu_idx, raddr);
+            addr = addr_add(env, addr, 2);
+            cpu_stb_mmuidx_ra(env, addr, val >> 8, mmu_idx, raddr);
+            break;
         }
     }
 }
@@ -166,12 +293,12 @@
     addr &= mask;
 
     /* Check reservation */
-    if ((env->reserve_addr & mask) == (addr & mask))  {
+    if ((env->reserve_addr & mask) == addr)  {
         env->reserve_addr = (target_ulong)-1ULL;
     }
 
     /* Try fast path translate */
-    haddr = tlb_vaddr_to_host(env, addr, MMU_DATA_STORE, mmu_idx);
+    haddr = probe_write(env, addr, dcbz_size, mmu_idx, retaddr);
     if (haddr) {
         memset(haddr, 0, dcbz_size);
     } else {
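The probe_contiguous() fast path above relies on the idiom
nb_pg1 = -(addr | TARGET_PAGE_MASK), which evaluates to the number of bytes
from addr to the end of its target page.  A standalone sketch of that
arithmetic, assuming 4 KiB pages purely for illustration (QEMU derives the
real mask from TARGET_PAGE_BITS):

    #include <stdint.h>
    #include <assert.h>

    #define PAGE_MASK_4K  (~0xfffULL)   /* assumed 4 KiB page mask */

    /* Bytes left on the page containing addr; a full page for aligned addr */
    static uint32_t bytes_to_page_end(uint64_t addr)
    {
        return -(uint32_t)(addr | PAGE_MASK_4K);
    }

    int main(void)
    {
        assert(bytes_to_page_end(0x1ffc) == 4);      /* 4 bytes before the boundary */
        assert(bytes_to_page_end(0x2000) == 0x1000); /* aligned: whole page ahead */
        return 0;
    }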
diff --git a/target/ppc/misc_helper.c b/target/ppc/misc_helper.c
index 2318f3a..55b68d1 100644
--- a/target/ppc/misc_helper.c
+++ b/target/ppc/misc_helper.c
@@ -41,6 +41,18 @@
 }
 
 #ifdef TARGET_PPC64
+static void raise_hv_fu_exception(CPUPPCState *env, uint32_t bit,
+                                  const char *caller, uint32_t cause,
+                                  uintptr_t raddr)
+{
+    qemu_log_mask(CPU_LOG_INT, "HV Facility %d is unavailable (%s)\n",
+                  bit, caller);
+
+    env->spr[SPR_HFSCR] &= ~((target_ulong)FSCR_IC_MASK << FSCR_IC_POS);
+
+    raise_exception_err_ra(env, POWERPC_EXCP_HV_FU, cause, raddr);
+}
+
 static void raise_fu_exception(CPUPPCState *env, uint32_t bit,
                                uint32_t sprn, uint32_t cause,
                                uintptr_t raddr)
@@ -55,6 +67,17 @@
 }
 #endif
 
+void helper_hfscr_facility_check(CPUPPCState *env, uint32_t bit,
+                                 const char *caller, uint32_t cause)
+{
+#ifdef TARGET_PPC64
+    if ((env->msr_mask & MSR_HVB) && !msr_hv &&
+                                     !(env->spr[SPR_HFSCR] & (1UL << bit))) {
+        raise_hv_fu_exception(env, bit, caller, cause, GETPC());
+    }
+#endif
+}
+
 void helper_fscr_facility_check(CPUPPCState *env, uint32_t bit,
                                 uint32_t sprn, uint32_t cause)
 {
@@ -105,6 +128,46 @@
 
     env->spr[SPR_PCR] = value & pcc->pcr_mask;
 }
+
+/*
+ * DPDES register is shared. Each bit reflects the state of the
+ * doorbell interrupt of a thread of the same core.
+ */
+target_ulong helper_load_dpdes(CPUPPCState *env)
+{
+    target_ulong dpdes = 0;
+
+    helper_hfscr_facility_check(env, HFSCR_MSGP, "load DPDES", HFSCR_IC_MSGP);
+
+    /* TODO: TCG supports only one thread */
+    if (env->pending_interrupts & (1 << PPC_INTERRUPT_DOORBELL)) {
+        dpdes = 1;
+    }
+
+    return dpdes;
+}
+
+void helper_store_dpdes(CPUPPCState *env, target_ulong val)
+{
+    PowerPCCPU *cpu = env_archcpu(env);
+    CPUState *cs = CPU(cpu);
+
+    helper_hfscr_facility_check(env, HFSCR_MSGP, "store DPDES", HFSCR_IC_MSGP);
+
+    /* TODO: TCG supports only one thread */
+    if (val & ~0x1) {
+        qemu_log_mask(LOG_GUEST_ERROR, "Invalid DPDES register value "
+                      TARGET_FMT_lx"\n", val);
+        return;
+    }
+
+    if (val & 0x1) {
+        env->pending_interrupts |= 1 << PPC_INTERRUPT_DOORBELL;
+        cpu_interrupt(cs, CPU_INTERRUPT_HARD);
+    } else {
+        env->pending_interrupts &= ~(1 << PPC_INTERRUPT_DOORBELL);
+    }
+}
 #endif /* defined(TARGET_PPC64) */
 
 void helper_store_pidr(CPUPPCState *env, target_ulong val)
diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
index 066e324..224e646 100644
--- a/target/ppc/mmu-radix64.c
+++ b/target/ppc/mmu-radix64.c
@@ -235,6 +235,12 @@
         /* In real mode top 4 effective addr bits (mostly) ignored */
         raddr = eaddr & 0x0FFFFFFFFFFFFFFFULL;
 
+        /* In HV mode, add HRMOR if top EA bit is clear */
+        if (msr_hv || !env->has_hv_mode) {
+            if (!(eaddr >> 63)) {
+                raddr |= env->spr[SPR_HRMOR];
+           }
+        }
         tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr & TARGET_PAGE_MASK,
                      PAGE_READ | PAGE_WRITE | PAGE_EXEC, mmu_idx,
                      TARGET_PAGE_SIZE);
diff --git a/target/ppc/trace-events b/target/ppc/trace-events
index 3dc6740..6d15aa9 100644
--- a/target/ppc/trace-events
+++ b/target/ppc/trace-events
@@ -28,3 +28,4 @@
 kvm_handle_epr(void) "handle epr"
 kvm_handle_watchdog_expiry(void) "handle watchdog expiry"
 kvm_handle_debug_exception(void) "handle debug exception"
+kvm_handle_nmi_exception(void) "handle NMI exception"
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 9dcf8dc..36fa273 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -6645,6 +6645,28 @@
 #endif /* defined(CONFIG_USER_ONLY) */
 }
 
+#if defined(TARGET_PPC64)
+static void gen_msgclrp(DisasContext *ctx)
+{
+#if defined(CONFIG_USER_ONLY)
+    GEN_PRIV;
+#else
+    CHK_SV;
+    gen_helper_book3s_msgclrp(cpu_env, cpu_gpr[rB(ctx->opcode)]);
+#endif /* defined(CONFIG_USER_ONLY) */
+}
+
+static void gen_msgsndp(DisasContext *ctx)
+{
+#if defined(CONFIG_USER_ONLY)
+    GEN_PRIV;
+#else
+    CHK_SV;
+    gen_helper_book3s_msgsndp(cpu_env, cpu_gpr[rB(ctx->opcode)]);
+#endif /* defined(CONFIG_USER_ONLY) */
+}
+#endif
+
 static void gen_msgsync(DisasContext *ctx)
 {
 #if defined(CONFIG_USER_ONLY)
@@ -7187,6 +7209,10 @@
 GEN_HANDLER_E(maddhd_maddhdu, 0x04, 0x18, 0xFF, 0x00000000, PPC_NONE,
               PPC2_ISA300),
 GEN_HANDLER_E(maddld, 0x04, 0x19, 0xFF, 0x00000000, PPC_NONE, PPC2_ISA300),
+GEN_HANDLER2_E(msgsndp, "msgsndp", 0x1F, 0x0E, 0x04, 0x03ff0001,
+               PPC_NONE, PPC2_ISA207S),
+GEN_HANDLER2_E(msgclrp, "msgclrp", 0x1F, 0x0E, 0x05, 0x03ff0001,
+               PPC_NONE, PPC2_ISA207S),
 #endif
 
 #undef GEN_INT_ARITH_ADD
diff --git a/target/ppc/translate_init.inc.c b/target/ppc/translate_init.inc.c
index 2d3efad..53995f6 100644
--- a/target/ppc/translate_init.inc.c
+++ b/target/ppc/translate_init.inc.c
@@ -464,6 +464,17 @@
 {
     gen_helper_store_pcr(cpu_env, cpu_gpr[gprn]);
 }
+
+/* DPDES */
+static void spr_read_dpdes(DisasContext *ctx, int gprn, int sprn)
+{
+    gen_helper_load_dpdes(cpu_gpr[gprn], cpu_env);
+}
+
+static void spr_write_dpdes(DisasContext *ctx, int sprn, int gprn)
+{
+    gen_helper_store_dpdes(cpu_env, cpu_gpr[gprn]);
+}
 #endif
 #endif
 
@@ -8238,10 +8249,11 @@
 {
 #if !defined(CONFIG_USER_ONLY)
     /* Directed Privileged Door-bell Exception State, used for IPI */
-    spr_register(env, SPR_DPDES, "DPDES",
-                 SPR_NOACCESS, SPR_NOACCESS,
-                 &spr_read_generic, SPR_NOACCESS,
-                 0x00000000);
+    spr_register_kvm_hv(env, SPR_DPDES, "DPDES",
+                        SPR_NOACCESS, SPR_NOACCESS,
+                        &spr_read_dpdes, SPR_NOACCESS,
+                        &spr_read_dpdes, &spr_write_dpdes,
+                        KVM_REG_PPC_DPDES, 0x00000000);
 #endif
 }
 
diff --git a/tests/qtest/boot-order-test.c b/tests/qtest/boot-order-test.c
index a725bce..4a6218a 100644
--- a/tests/qtest/boot-order-test.c
+++ b/tests/qtest/boot-order-test.c
@@ -108,30 +108,6 @@
     test_boot_orders(NULL, read_boot_order_pc, test_cases_pc);
 }
 
-static uint8_t read_m48t59(QTestState *qts, uint64_t addr, uint16_t reg)
-{
-    qtest_writeb(qts, addr, reg & 0xff);
-    qtest_writeb(qts, addr + 1, reg >> 8);
-    return qtest_readb(qts, addr + 3);
-}
-
-static uint64_t read_boot_order_prep(QTestState *qts)
-{
-    return read_m48t59(qts, 0x80000000 + 0x74, 0x34);
-}
-
-static const boot_order_test test_cases_prep[] = {
-    { "", 'c', 'c' },
-    { "-boot c", 'c', 'c' },
-    { "-boot d", 'd', 'd' },
-    {}
-};
-
-static void test_prep_boot_order(void)
-{
-    test_boot_orders("prep", read_boot_order_prep, test_cases_prep);
-}
-
 static uint64_t read_boot_order_pmac(QTestState *qts)
 {
     QFWCFG *fw_cfg = mm_fw_cfg_init(qts, 0xf0000510);
@@ -190,7 +166,6 @@
     if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) {
         qtest_add_func("boot-order/pc", test_pc_boot_order);
     } else if (strcmp(arch, "ppc") == 0 || strcmp(arch, "ppc64") == 0) {
-        qtest_add_func("boot-order/prep", test_prep_boot_order);
         qtest_add_func("boot-order/pmac_oldworld",
                        test_pmac_oldworld_boot_order);
         qtest_add_func("boot-order/pmac_newworld",
diff --git a/tests/qtest/boot-serial-test.c b/tests/qtest/boot-serial-test.c
index 8e8c5b0..85a3614 100644
--- a/tests/qtest/boot-serial-test.c
+++ b/tests/qtest/boot-serial-test.c
@@ -15,6 +15,7 @@
 
 #include "qemu/osdep.h"
 #include "libqtest.h"
+#include "libqos/libqos-spapr.h"
 
 static const uint8_t kernel_mcf5208[] = {
     0x41, 0xf9, 0xfc, 0x06, 0x00, 0x00,     /* lea 0xfc060000,%a0 */
@@ -112,7 +113,7 @@
     { "ppc64", "40p", "-m 192", "Memory: 192M" },
     { "ppc64", "mac99", "", "PowerPC,970FX" },
     { "ppc64", "pseries",
-      "-machine cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken",
+      "-machine " PSERIES_DEFAULT_CAPABILITIES,
       "Open Firmware" },
     { "ppc64", "powernv8", "", "OPAL" },
     { "ppc64", "powernv9", "", "OPAL" },
diff --git a/tests/qtest/cdrom-test.c b/tests/qtest/cdrom-test.c
index 67635e3..833a050 100644
--- a/tests/qtest/cdrom-test.c
+++ b/tests/qtest/cdrom-test.c
@@ -189,7 +189,7 @@
         add_s390x_tests();
     } else if (g_str_equal(arch, "ppc64")) {
         const char *ppcmachines[] = {
-            "pseries", "mac99", "g3beige", "40p", "prep", NULL
+            "pseries", "mac99", "g3beige", "40p", NULL
         };
         add_cdrom_param_tests(ppcmachines);
     } else if (g_str_equal(arch, "sparc")) {
diff --git a/tests/qtest/endianness-test.c b/tests/qtest/endianness-test.c
index 5852795..2798802 100644
--- a/tests/qtest/endianness-test.c
+++ b/tests/qtest/endianness-test.c
@@ -35,7 +35,7 @@
     { "mips64", "malta", 0x10000000, .bswap = true },
     { "mips64el", "fulong2e", 0x1fd00000 },
     { "ppc", "g3beige", 0xfe000000, .bswap = true, .superio = "i82378" },
-    { "ppc", "prep", 0x80000000, .bswap = true },
+    { "ppc", "40p", 0x80000000, .bswap = true },
     { "ppc", "bamboo", 0xe8000000, .bswap = true, .superio = "i82378" },
     { "ppc64", "mac99", 0xf2000000, .bswap = true, .superio = "i82378" },
     { "ppc64", "pseries", (1ULL << 45), .bswap = true, .superio = "i82378" },
diff --git a/tests/qtest/libqos/libqos-spapr.h b/tests/qtest/libqos/libqos-spapr.h
index dcb5c43..d9c4c22 100644
--- a/tests/qtest/libqos/libqos-spapr.h
+++ b/tests/qtest/libqos/libqos-spapr.h
@@ -7,4 +7,12 @@
 QOSState *qtest_spapr_boot(const char *cmdline_fmt, ...);
 void qtest_spapr_shutdown(QOSState *qs);
 
+/* List of capabilities needed to silence warnings with TCG */
+#define PSERIES_DEFAULT_CAPABILITIES             \
+    "cap-cfpc=broken,"                           \
+    "cap-sbbc=broken,"                           \
+    "cap-ibs=broken,"                            \
+    "cap-ccf-assist=off,"                        \
+    "cap-fwnmi-mce=off"
+
 #endif
diff --git a/tests/qtest/prom-env-test.c b/tests/qtest/prom-env-test.c
index 9be52c7..60e6ec3 100644
--- a/tests/qtest/prom-env-test.c
+++ b/tests/qtest/prom-env-test.c
@@ -21,6 +21,7 @@
 
 #include "qemu/osdep.h"
 #include "libqtest.h"
+#include "libqos/libqos-spapr.h"
 
 #define MAGIC   0xcafec0de
 #define ADDRESS 0x4000
@@ -54,7 +55,7 @@
      */
     if (strcmp(machine, "pseries") == 0) {
         extra_args = "-nodefaults"
-            " -machine cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken";
+            " -machine " PSERIES_DEFAULT_CAPABILITIES;
     }
 
     qts = qtest_initf("-M %s -accel tcg %s -prom-env 'use-nvramrc?=true' "
diff --git a/tests/qtest/pxe-test.c b/tests/qtest/pxe-test.c
index f68d0aa..1161a77 100644
--- a/tests/qtest/pxe-test.c
+++ b/tests/qtest/pxe-test.c
@@ -17,6 +17,7 @@
 #include "qemu-common.h"
 #include "libqtest.h"
 #include "boot-sector.h"
+#include "libqos/libqos-spapr.h"
 
 #define NETNAME "net0"
 
@@ -46,15 +47,15 @@
 
 static testdef_t ppc64_tests[] = {
     { "pseries", "spapr-vlan",
-      "-machine cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,vsmt=8" },
+      "-machine vsmt=8," PSERIES_DEFAULT_CAPABILITIES },
     { "pseries", "virtio-net-pci",
-      "-machine cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,vsmt=8" },
+      "-machine vsmt=8," PSERIES_DEFAULT_CAPABILITIES },
     { NULL },
 };
 
 static testdef_t ppc64_tests_slow[] = {
     { "pseries", "e1000",
-      "-machine cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,vsmt=8" },
+      "-machine vsmt=8," PSERIES_DEFAULT_CAPABILITIES },
     { NULL },
 };