Hello community,
here is the log from the commit of package xen for openSUSE:Factory checked in at 2019-10-05 16:18:48
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/xen (Old)
and /work/SRC/openSUSE:Factory/.xen.new.2352 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "xen"
Sat Oct 5 16:18:48 2019 rev:271 rq:734420 version:4.12.1_02
Changes:
--------
--- /work/SRC/openSUSE:Factory/xen/xen.changes 2019-09-11 10:22:34.415494531 +0200
+++ /work/SRC/openSUSE:Factory/.xen.new.2352/xen.changes 2019-10-05 16:19:18.641592606 +0200
@@ -1,0 +2,62 @@
+Mon Sep 30 10:43:43 MDT 2019 - carnold@suse.com
+
+- bsc#1135799 - Partner-L3: Xen crashes on AMD ROME based machines
+ 5ca7660f-x86-entry-drop-unused-includes.patch
+ 5cf8da09-adjust-sysdom-creation-call-earlier-on-x86.patch
+ 5cab2a6b-x86-ACPI-also-parse-AMD-tables-early.patch
+ 5cab2ab7-x86-IOMMU-introduce-init-ops.patch
+ 5cab2ae8-x86-IOMMU-abstract-iommu_supports_eim.patch
+ 5cab2b4e-x86-IOMMU-abstract-iommu_enable_x2apic_IR.patch
+ 5cab2b95-x86-IOMMU-initialize-iommu_ops-in.patch
+ 5cac9a4b-x86-IOMMU-abstract-adjust_vtd_irq_affinities.patch
+ 5cdeac7f-AMD-IOMMU-adjust-IOMMU-list-head-init.patch
+ 5d0cf4e4-AMD-IOMMU-initialize-IRQ-tasklet-once.patch
+ 5d149bb0-AMD-IOMMU-dont-add-IOMMUs.patch
+ 5d1b3fab-AMD-IOMMU-restrict-feature-logging.patch
+ 5d358508-x86-IRQ-desc-affinity-represents-request.patch
+ 5d358534-x86-IRQ-consolidate-arch-cpu_mask-use.patch
+ 5d358a67-AMD-IOMMU-pass-IOMMU-to-iterate_ivrs_entries-cb.patch
+ 5d358a92-AMD-IOMMU-pass-IOMMU-to-amd_iommu_alloc_intremap_table.patch
+ 5d39811c-x86-IOMMU-dont-restrict-IRQ-affinities.patch
+ 5d417813-AMD-IOMMU-bitfield-extended-features.patch
+ 5d417838-AMD-IOMMU-bitfield-control-reg.patch
+ 5d41785b-AMD-IOMMU-bitfield-IRTE.patch
+ 5d41787e-AMD-IOMMU-pass-IOMMU-to-gfu-intremap-entry.patch
+ 5d4178ad-AMD-IOMMU-128bit-non-guest-APIC-IRTE.patch
+ 5d4178fc-AMD-IOMMU-split-amd_iommu_init_one.patch
+ 5d41793f-AMD-IOMMU-allow-enabling-without-IRQ.patch
+ 5d417a16-AMD-IOMMU-adjust-IRQ-setup-for-x2APIC.patch
+ 5d417ab6-AMD-IOMMU-enable-x2APIC-mode.patch
+ 5d417b38-AMD-IOMMU-correct-IRTE-updating.patch
+ 5d417b6a-AMD-IOMMU-dont-needlessly-log-headers.patch
+ 5d4a9d25-AMD-IOMMU-drop-not-found-message.patch
+ 5d80e7c0-AMD-IOMMU-free-shared-IRT-once.patch
+ 5d80e80d-AMD-IOMMU-valid-flag-for-IVRS-mappings.patch
+ 5d80e82e-AMD-IOMMU-alloc_intremap_table-callers-handle-errors.patch
+ 5d80e857-x86-PCI-read-MSI-X-table-entry-count-early.patch
+ 5d8b72e5-AMD-IOMMU-dont-blindly-alloc-intremap-tables.patch
+ 5d8b730e-AMD-IOMMU-phantom-funcs-share-intremap-tables.patch
+ 5d8b733b-x86-PCI-read-max-MSI-vector-count-early.patch
+ 5d8b736d-AMD-IOMMU-replace-INTREMAP_ENTRIES.patch
+ 5d8b7393-AMD-IOMMU-restrict-intremap-table-sizes.patch
+- bsc#1145240 - [Migration]Can't pre-allocate 1 shadow pages
+ 5d70bfba-x86-shadow-dont-enable-with-too-small-allocation.patch
+- bsc#1137717 - [HPS Bug] Unable to install Windows Server 2016
+ with 2 CPUs setting (or above) under SLES12 SP4 Xen Server on AMD
+ ROME platform
+ 5d89d8d9-libxc-x86-avoid-overflow-in-CPUID-APIC-ID.patch
+- Upstream bug fixes (bsc#1027519)
+ 5d67ceaf-x86-properly-gate-PKU-clearing.patch
+ 5d779811-x86-fix-CPUID7-0-eax-levelling-MSR.patch
+ 5d77b40f-fix-hvm_all_ioreq_servers_add_vcpu-cleanup.patch
+ 5d80ea13-vpci-honor-read-only-devices.patch
+ 5d8b715f-ACPI-cpuidle-bump-max-num-of-states.patch
+
+-------------------------------------------------------------------
+Fri Sep 27 16:25:38 UTC 2019 - ohering@suse.de
+
+- bsc#1145774 - libvirtd segfaults when trying to live migrate a VM
+ Fix crash in an error path of libxl_domain_suspend with
+ libxl.helper_done-crash.patch
+
+-------------------------------------------------------------------
New:
----
5ca7660f-x86-entry-drop-unused-includes.patch
5cab2a6b-x86-ACPI-also-parse-AMD-tables-early.patch
5cab2ab7-x86-IOMMU-introduce-init-ops.patch
5cab2ae8-x86-IOMMU-abstract-iommu_supports_eim.patch
5cab2b4e-x86-IOMMU-abstract-iommu_enable_x2apic_IR.patch
5cab2b95-x86-IOMMU-initialize-iommu_ops-in.patch
5cac9a4b-x86-IOMMU-abstract-adjust_vtd_irq_affinities.patch
5cdeac7f-AMD-IOMMU-adjust-IOMMU-list-head-init.patch
5cf8da09-adjust-sysdom-creation-call-earlier-on-x86.patch
5d0cf4e4-AMD-IOMMU-initialize-IRQ-tasklet-once.patch
5d149bb0-AMD-IOMMU-dont-add-IOMMUs.patch
5d1b3fab-AMD-IOMMU-restrict-feature-logging.patch
5d358508-x86-IRQ-desc-affinity-represents-request.patch
5d358534-x86-IRQ-consolidate-arch-cpu_mask-use.patch
5d358a67-AMD-IOMMU-pass-IOMMU-to-iterate_ivrs_entries-cb.patch
5d358a92-AMD-IOMMU-pass-IOMMU-to-amd_iommu_alloc_intremap_table.patch
5d39811c-x86-IOMMU-dont-restrict-IRQ-affinities.patch
5d417813-AMD-IOMMU-bitfield-extended-features.patch
5d417838-AMD-IOMMU-bitfield-control-reg.patch
5d41785b-AMD-IOMMU-bitfield-IRTE.patch
5d41787e-AMD-IOMMU-pass-IOMMU-to-gfu-intremap-entry.patch
5d4178ad-AMD-IOMMU-128bit-non-guest-APIC-IRTE.patch
5d4178fc-AMD-IOMMU-split-amd_iommu_init_one.patch
5d41793f-AMD-IOMMU-allow-enabling-without-IRQ.patch
5d417a16-AMD-IOMMU-adjust-IRQ-setup-for-x2APIC.patch
5d417ab6-AMD-IOMMU-enable-x2APIC-mode.patch
5d417b38-AMD-IOMMU-correct-IRTE-updating.patch
5d417b6a-AMD-IOMMU-dont-needlessly-log-headers.patch
5d4a9d25-AMD-IOMMU-drop-not-found-message.patch
5d67ceaf-x86-properly-gate-PKU-clearing.patch
5d70bfba-x86-shadow-dont-enable-with-too-small-allocation.patch
5d779811-x86-fix-CPUID7-0-eax-levelling-MSR.patch
5d77b40f-fix-hvm_all_ioreq_servers_add_vcpu-cleanup.patch
5d80e7c0-AMD-IOMMU-free-shared-IRT-once.patch
5d80e80d-AMD-IOMMU-valid-flag-for-IVRS-mappings.patch
5d80e82e-AMD-IOMMU-alloc_intremap_table-callers-handle-errors.patch
5d80e857-x86-PCI-read-MSI-X-table-entry-count-early.patch
5d80ea13-vpci-honor-read-only-devices.patch
5d89d8d9-libxc-x86-avoid-overflow-in-CPUID-APIC-ID.patch
5d8b715f-ACPI-cpuidle-bump-max-num-of-states.patch
5d8b72e5-AMD-IOMMU-dont-blindly-alloc-intremap-tables.patch
5d8b730e-AMD-IOMMU-phantom-funcs-share-intremap-tables.patch
5d8b733b-x86-PCI-read-max-MSI-vector-count-early.patch
5d8b736d-AMD-IOMMU-replace-INTREMAP_ENTRIES.patch
5d8b7393-AMD-IOMMU-restrict-intremap-table-sizes.patch
libxl.helper_done-crash.patch
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ xen.spec ++++++
--- /var/tmp/diff_new_pack.jOm0Z1/_old 2019-10-05 16:19:20.725587180 +0200
+++ /var/tmp/diff_new_pack.jOm0Z1/_new 2019-10-05 16:19:20.725587180 +0200
@@ -167,6 +167,52 @@
Patch6: 5d516531-x86-xpti-dont-leak-TSS-adjacent-data.patch
Patch7: 5d5bf475-x86-PV-fix-handling-of-iommu-mappings.patch
Patch8: 5d6524ca-x86-mm-correctly-init-M2P-entries.patch
+Patch9: 5d67ceaf-x86-properly-gate-PKU-clearing.patch
+Patch10: 5d70bfba-x86-shadow-dont-enable-with-too-small-allocation.patch
+Patch11: 5d779811-x86-fix-CPUID7-0-eax-levelling-MSR.patch
+Patch12: 5d77b40f-fix-hvm_all_ioreq_servers_add_vcpu-cleanup.patch
+Patch13: 5d80ea13-vpci-honor-read-only-devices.patch
+Patch14: 5d89d8d9-libxc-x86-avoid-overflow-in-CPUID-APIC-ID.patch
+Patch15: 5d8b715f-ACPI-cpuidle-bump-max-num-of-states.patch
+# AMD x2APIC
+Patch301: 5ca7660f-x86-entry-drop-unused-includes.patch
+Patch302: 5cf8da09-adjust-sysdom-creation-call-earlier-on-x86.patch
+Patch303: 5cab2a6b-x86-ACPI-also-parse-AMD-tables-early.patch
+Patch304: 5cab2ab7-x86-IOMMU-introduce-init-ops.patch
+Patch305: 5cab2ae8-x86-IOMMU-abstract-iommu_supports_eim.patch
+Patch306: 5cab2b4e-x86-IOMMU-abstract-iommu_enable_x2apic_IR.patch
+Patch307: 5cab2b95-x86-IOMMU-initialize-iommu_ops-in.patch
+Patch308: 5cac9a4b-x86-IOMMU-abstract-adjust_vtd_irq_affinities.patch
+Patch309: 5cdeac7f-AMD-IOMMU-adjust-IOMMU-list-head-init.patch
+Patch310: 5d0cf4e4-AMD-IOMMU-initialize-IRQ-tasklet-once.patch
+Patch311: 5d149bb0-AMD-IOMMU-dont-add-IOMMUs.patch
+Patch312: 5d1b3fab-AMD-IOMMU-restrict-feature-logging.patch
+Patch313: 5d358508-x86-IRQ-desc-affinity-represents-request.patch
+Patch314: 5d358534-x86-IRQ-consolidate-arch-cpu_mask-use.patch
+Patch315: 5d358a67-AMD-IOMMU-pass-IOMMU-to-iterate_ivrs_entries-cb.patch
+Patch316: 5d358a92-AMD-IOMMU-pass-IOMMU-to-amd_iommu_alloc_intremap_table.patch
+Patch317: 5d39811c-x86-IOMMU-dont-restrict-IRQ-affinities.patch
+Patch318: 5d417813-AMD-IOMMU-bitfield-extended-features.patch
+Patch319: 5d417838-AMD-IOMMU-bitfield-control-reg.patch
+Patch320: 5d41785b-AMD-IOMMU-bitfield-IRTE.patch
+Patch321: 5d41787e-AMD-IOMMU-pass-IOMMU-to-gfu-intremap-entry.patch
+Patch322: 5d4178ad-AMD-IOMMU-128bit-non-guest-APIC-IRTE.patch
+Patch323: 5d4178fc-AMD-IOMMU-split-amd_iommu_init_one.patch
+Patch324: 5d41793f-AMD-IOMMU-allow-enabling-without-IRQ.patch
+Patch325: 5d417a16-AMD-IOMMU-adjust-IRQ-setup-for-x2APIC.patch
+Patch326: 5d417ab6-AMD-IOMMU-enable-x2APIC-mode.patch
+Patch327: 5d417b38-AMD-IOMMU-correct-IRTE-updating.patch
+Patch328: 5d417b6a-AMD-IOMMU-dont-needlessly-log-headers.patch
+Patch329: 5d4a9d25-AMD-IOMMU-drop-not-found-message.patch
+Patch330: 5d80e7c0-AMD-IOMMU-free-shared-IRT-once.patch
+Patch331: 5d80e80d-AMD-IOMMU-valid-flag-for-IVRS-mappings.patch
+Patch332: 5d80e82e-AMD-IOMMU-alloc_intremap_table-callers-handle-errors.patch
+Patch333: 5d80e857-x86-PCI-read-MSI-X-table-entry-count-early.patch
+Patch334: 5d8b72e5-AMD-IOMMU-dont-blindly-alloc-intremap-tables.patch
+Patch335: 5d8b730e-AMD-IOMMU-phantom-funcs-share-intremap-tables.patch
+Patch336: 5d8b733b-x86-PCI-read-max-MSI-vector-count-early.patch
+Patch337: 5d8b736d-AMD-IOMMU-replace-INTREMAP_ENTRIES.patch
+Patch338: 5d8b7393-AMD-IOMMU-restrict-intremap-table-sizes.patch
# Our platform specific patches
Patch400: xen-destdir.patch
Patch401: vif-bridge-no-iptables.patch
@@ -202,6 +248,7 @@
Patch466: libxl.set-migration-constraints-from-cmdline.patch
Patch467: xenstore-run-in-studomain.patch
Patch468: libxl.prepare-environment-for-domcreate_stream_done.patch
+Patch469: libxl.helper_done-crash.patch
# python3 conversion patches
Patch500: build-python3-conversion.patch
Patch501: pygrub-python3-conversion.patch
@@ -373,6 +420,51 @@
%patch6 -p1
%patch7 -p1
%patch8 -p1
+%patch9 -p1
+%patch10 -p1
+%patch11 -p1
+%patch12 -p1
+%patch13 -p1
+%patch14 -p1
+%patch15 -p1
+%patch301 -p1
+%patch302 -p1
+%patch303 -p1
+%patch304 -p1
+%patch305 -p1
+%patch306 -p1
+%patch307 -p1
+%patch308 -p1
+%patch309 -p1
+%patch310 -p1
+%patch311 -p1
+%patch312 -p1
+%patch313 -p1
+%patch314 -p1
+%patch315 -p1
+%patch316 -p1
+%patch317 -p1
+%patch318 -p1
+%patch319 -p1
+%patch320 -p1
+%patch321 -p1
+%patch322 -p1
+%patch323 -p1
+%patch324 -p1
+%patch325 -p1
+%patch326 -p1
+%patch327 -p1
+%patch328 -p1
+%patch329 -p1
+%patch330 -p1
+%patch331 -p1
+%patch332 -p1
+%patch333 -p1
+%patch334 -p1
+%patch335 -p1
+%patch336 -p1
+%patch337 -p1
+%patch338 -p1
# Our platform specific patches
%patch400 -p1
%patch401 -p1
@@ -408,6 +500,7 @@
%patch466 -p1
%patch467 -p1
%patch468 -p1
+%patch469 -p1
# python3 conversion patches
%patch500 -p1
%patch501 -p1
++++++ 5ca7660f-x86-entry-drop-unused-includes.patch ++++++
References: bsc#1135799
# Commit 3f76e83c4cf6eeacc53f50d1d7bf7645497d94e9
# Date 2019-04-05 16:28:31 +0200
# Author Jan Beulich
# Committer Jan Beulich
x86/entry: drop unused header inclusions
I'm in particular after getting rid of asm/apicdef.h, but there are more
no longer (or perhaps never having been) used ones.
Signed-off-by: Jan Beulich
Reviewed-by: Andrew Cooper
Reviewed-by: Kevin Tian
Reviewed-by: Boris Ostrovsky
--- a/xen/arch/x86/hvm/svm/entry.S
+++ b/xen/arch/x86/hvm/svm/entry.S
@@ -19,13 +19,8 @@
.file "svm/entry.S"
-#include
-#include
-#include
#include
-#include
#include
-#include
#define VMRUN .byte 0x0F,0x01,0xD8
#define STGI .byte 0x0F,0x01,0xDC
--- a/xen/arch/x86/hvm/vmx/entry.S
+++ b/xen/arch/x86/hvm/vmx/entry.S
@@ -18,13 +18,8 @@
.file "vmx/entry.S"
-#include
-#include
-#include
#include
-#include
#include
-#include
#define VMRESUME .byte 0x0f,0x01,0xc3
#define VMLAUNCH .byte 0x0f,0x01,0xc2
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -4,10 +4,7 @@
.file "x86_64/compat/entry.S"
-#include
-#include
#include
-#include
#include
#include
#include
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -6,10 +6,7 @@
.file "x86_64/entry.S"
-#include
-#include
#include
-#include
#include
#include
#include
++++++ 5cab2a6b-x86-ACPI-also-parse-AMD-tables-early.patch ++++++
References: bsc#1135799
# Commit 9fa94e1058543759a7d45237f06c80cde3008d41
# Date 2019-04-08 13:03:07 +0200
# Author Jan Beulich
# Committer Jan Beulich
x86/ACPI: also parse AMD IOMMU tables early
In order to be able to initialize x2APIC mode we need to parse
respective ACPI tables early. Split amd_iov_detect() into two parts for
this purpose, and call the initial part earlier on.
Signed-off-by: Jan Beulich
Reviewed-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/arch/x86/acpi/boot.c
+++ b/xen/arch/x86/acpi/boot.c
@@ -733,7 +733,7 @@ int __init acpi_boot_init(void)
acpi_mmcfg_init();
- acpi_dmar_init();
+ acpi_iommu_init();
erst_init();
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -23,6 +23,7 @@
#include
#include
#include
+#include
#include
#include
#include "../ats.h"
@@ -170,7 +171,7 @@ static void amd_iommu_setup_domain_devic
}
}
-int __init amd_iov_detect(void)
+int __init acpi_ivrs_init(void)
{
INIT_LIST_HEAD(&amd_iommu_head);
@@ -184,6 +185,14 @@ int __init amd_iov_detect(void)
return -ENODEV;
}
+ return 0;
+}
+
+int __init amd_iov_detect(void)
+{
+ if ( !iommu_enable && !iommu_intremap )
+ return 0;
+
iommu_ops = amd_iommu_ops;
if ( amd_iommu_init() != 0 )
--- a/xen/include/asm-x86/acpi.h
+++ b/xen/include/asm-x86/acpi.h
@@ -26,6 +26,7 @@
#include
#include
#include
+#include
#define COMPILER_DEPENDENT_INT64 long long
#define COMPILER_DEPENDENT_UINT64 unsigned long long
@@ -145,6 +146,15 @@ extern u32 pmtmr_ioport;
extern unsigned int pmtmr_width;
int acpi_dmar_init(void);
+int acpi_ivrs_init(void);
+
+static inline int acpi_iommu_init(void)
+{
+ int ret = acpi_dmar_init();
+
+ return ret == -ENODEV ? acpi_ivrs_init() : ret;
+}
+
void acpi_mmcfg_init(void);
/* Incremented whenever we transition through S3. Value is 1 during boot. */
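The interesting part of this change is the dispatch helper: try the Intel
DMAR parser first and fall back to the AMD IVRS parser only when no DMAR
table exists (-ENODEV), while propagating any other error. Below is a
minimal standalone sketch of that fallback pattern; the stub return values
are made up for illustration and this is not Xen code.

#include <errno.h>
#include <stdio.h>

/* Stand-ins for the real table parsers; -ENODEV means "table absent". */
static int acpi_dmar_init(void) { return -ENODEV; /* no Intel DMAR table */ }
static int acpi_ivrs_init(void) { return 0;       /* AMD IVRS table found */ }

/* Mirrors the new acpi_iommu_init() helper: DMAR first, IVRS as fallback,
 * any error other than -ENODEV passed through unchanged. */
static int acpi_iommu_init(void)
{
    int ret = acpi_dmar_init();

    return ret == -ENODEV ? acpi_ivrs_init() : ret;
}

int main(void)
{
    printf("acpi_iommu_init() -> %d\n", acpi_iommu_init());
    return 0;
}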
++++++ 5cab2ab7-x86-IOMMU-introduce-init-ops.patch ++++++
References: bsc#1135799
# Commit 1b3cc8000c82edc9761c1e595928d6584e11f9f5
# Date 2019-04-08 13:04:23 +0200
# Author Jan Beulich
# Committer Jan Beulich
x86/IOMMU: introduce init-ops structure
Do away with the CPU vendor dependency, and set the init ops pointer
based on which ACPI tables have been found.
Also take the opportunity and add __read_mostly to iommu_ops.
Signed-off-by: Jan Beulich
Reviewed-by: Andrew Cooper
Reviewed-by: Kevin Tian
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -30,6 +30,7 @@
static bool_t __read_mostly init_done;
+static const struct iommu_init_ops _iommu_init_ops;
static const struct iommu_ops amd_iommu_ops;
struct amd_iommu *find_iommu_for_device(int seg, int bdf)
@@ -185,10 +186,12 @@ int __init acpi_ivrs_init(void)
return -ENODEV;
}
+ iommu_init_ops = &_iommu_init_ops;
+
return 0;
}
-int __init amd_iov_detect(void)
+static int __init iov_detect(void)
{
if ( !iommu_enable && !iommu_intremap )
return 0;
@@ -604,3 +607,7 @@ static const struct iommu_ops __initcons
.crash_shutdown = amd_iommu_crash_shutdown,
.dump_p2m_table = amd_dump_p2m_table,
};
+
+static const struct iommu_init_ops __initconstrel _iommu_init_ops = {
+ .setup = iov_detect,
+};
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -993,7 +993,11 @@ int __init acpi_dmar_init(void)
ret = parse_dmar_table(acpi_parse_dmar);
if ( !ret )
+ {
+ iommu_init_ops = &intel_iommu_init_ops;
+
return add_user_rmrr();
+ }
return ret;
}
--- a/xen/drivers/passthrough/vtd/extern.h
+++ b/xen/drivers/passthrough/vtd/extern.h
@@ -27,6 +27,7 @@
struct pci_ats_dev;
extern bool_t rwbf_quirk;
+extern const struct iommu_init_ops intel_iommu_init_ops;
extern const struct iommu_ops intel_iommu_ops;
void print_iommu_regs(struct acpi_drhd_unit *drhd);
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2280,7 +2280,7 @@ static void __hwdom_init setup_hwdom_rmr
pcidevs_unlock();
}
-int __init intel_vtd_setup(void)
+static int __init vtd_setup(void)
{
struct acpi_drhd_unit *drhd;
struct iommu *iommu;
@@ -2735,6 +2735,10 @@ const struct iommu_ops __initconstrel in
.dump_p2m_table = vtd_dump_p2m_table,
};
+const struct iommu_init_ops __initconstrel intel_iommu_init_ops = {
+ .setup = vtd_setup,
+};
+
/*
* Local variables:
* mode: C
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -23,7 +23,8 @@
#include
#include
-struct iommu_ops iommu_ops;
+const struct iommu_init_ops *__initdata iommu_init_ops;
+struct iommu_ops __read_mostly iommu_ops;
void iommu_update_ire_from_apic(
unsigned int apic, unsigned int reg, unsigned int value)
--- a/xen/include/asm-x86/iommu.h
+++ b/xen/include/asm-x86/iommu.h
@@ -56,9 +56,6 @@ struct arch_iommu
struct guest_iommu *g_iommu;
};
-int intel_vtd_setup(void);
-int amd_iov_detect(void);
-
extern struct iommu_ops iommu_ops;
static inline const struct iommu_ops *iommu_get_ops(void)
@@ -67,17 +64,15 @@ static inline const struct iommu_ops *io
return &iommu_ops;
}
+struct iommu_init_ops {
+ int (*setup)(void);
+};
+
+extern const struct iommu_init_ops *iommu_init_ops;
+
static inline int iommu_hardware_setup(void)
{
- switch ( boot_cpu_data.x86_vendor )
- {
- case X86_VENDOR_INTEL:
- return intel_vtd_setup();
- case X86_VENDOR_AMD:
- return amd_iov_detect();
- }
-
- return -ENODEV;
+ return iommu_init_ops ? iommu_init_ops->setup() : -ENODEV;
}
/* Are we using the domain P2M table as its IOMMU pagetable? */
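This patch replaces a hard-coded CPU-vendor switch with a pointer to an
init-ops structure that the ACPI table parsers fill in. A small standalone
model of the resulting dispatch follows; the names mirror the patch, but
all stubs are invented.

#include <errno.h>
#include <stdio.h>

/* Modeled on struct iommu_init_ops: the table parser sets the pointer,
 * and setup dispatches through it instead of switching on the vendor. */
struct iommu_init_ops {
    int (*setup)(void);
};

static int vtd_setup(void) { return 0; }

static const struct iommu_init_ops intel_iommu_init_ops = {
    .setup = vtd_setup,
};

static const struct iommu_init_ops *iommu_init_ops; /* NULL until parse */

static int iommu_hardware_setup(void)
{
    return iommu_init_ops ? iommu_init_ops->setup() : -ENODEV;
}

int main(void)
{
    printf("before table parse: %d\n", iommu_hardware_setup()); /* -ENODEV */
    iommu_init_ops = &intel_iommu_init_ops; /* as acpi_dmar_init() would */
    printf("after table parse:  %d\n", iommu_hardware_setup()); /* 0 */
    return 0;
}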
++++++ 5cab2ae8-x86-IOMMU-abstract-iommu_supports_eim.patch ++++++
References: bsc#1135799
# Commit cd7680326a51d9e65ec8a966dfad4ca24cf5d4df
# Date 2019-04-08 13:05:12 +0200
# Author Jan Beulich
# Committer Jan Beulich
x86/IOMMU: abstract Intel-specific iommu_supports_eim()
Introduce a respective element in struct iommu_init_ops.
Take the liberty and also switch intel_iommu_supports_eim() to bool/
true/false, to fully match the hook's type.
Signed-off-by: Jan Beulich
Reviewed-by: Andrew Cooper
Reviewed-by: Kevin Tian
--- a/xen/arch/x86/apic.c
+++ b/xen/arch/x86/apic.c
@@ -898,14 +898,14 @@ void __init x2apic_bsp_setup(void)
printk("x2APIC: Already enabled by BIOS: Ignoring cmdline disable.\n");
}
- if ( !iommu_supports_eim() )
+ if ( !iommu_supports_x2apic() )
{
if ( !x2apic_enabled )
{
- printk("Not enabling x2APIC: depends on iommu_supports_eim.\n");
+ printk("Not enabling x2APIC: depends on IOMMU support\n");
return;
}
- panic("x2APIC: already enabled by BIOS, but iommu_supports_eim failed\n");
+ panic("x2APIC: already enabled by BIOS, but no IOMMU support\n");
}
if ( (ioapic_entries = alloc_ioapic_entries()) == NULL )
--- a/xen/drivers/passthrough/vtd/extern.h
+++ b/xen/drivers/passthrough/vtd/extern.h
@@ -34,6 +34,8 @@ void print_iommu_regs(struct acpi_drhd_u
void print_vtd_entries(struct iommu *iommu, int bus, int devfn, u64 gmfn);
keyhandler_fn_t vtd_dump_iommu_info;
+bool intel_iommu_supports_eim(void);
+
int enable_qinval(struct iommu *iommu);
void disable_qinval(struct iommu *iommu);
int enable_intremap(struct iommu *iommu, int eim);
--- a/xen/drivers/passthrough/vtd/intremap.c
+++ b/xen/drivers/passthrough/vtd/intremap.c
@@ -142,13 +142,13 @@ static void set_hpet_source_id(unsigned
set_ire_sid(ire, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, hpetid_to_bdf(id));
}
-bool_t __init iommu_supports_eim(void)
+bool __init intel_iommu_supports_eim(void)
{
struct acpi_drhd_unit *drhd;
unsigned int apic;
if ( !iommu_qinval || !iommu_intremap || list_empty(&acpi_drhd_units) )
- return 0;
+ return false;
/* We MUST have a DRHD unit for each IOAPIC. */
for ( apic = 0; apic < nr_ioapics; apic++ )
@@ -157,16 +157,16 @@ bool_t __init iommu_supports_eim(void)
dprintk(XENLOG_WARNING VTDPREFIX,
"There is not a DRHD for IOAPIC %#x (id: %#x)!\n",
apic, IO_APIC_ID(apic));
- return 0;
+ return false;
}
for_each_drhd_unit ( drhd )
if ( !ecap_queued_inval(drhd->iommu->ecap) ||
!ecap_intr_remap(drhd->iommu->ecap) ||
!ecap_eim(drhd->iommu->ecap) )
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
@@ -889,7 +889,7 @@ int iommu_enable_x2apic_IR(void)
if ( system_state < SYS_STATE_active )
{
- if ( !iommu_supports_eim() )
+ if ( !intel_iommu_supports_eim() )
return -EOPNOTSUPP;
if ( !platform_supports_x2apic() )
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2737,6 +2737,7 @@ const struct iommu_ops __initconstrel in
const struct iommu_init_ops __initconstrel intel_iommu_init_ops = {
.setup = vtd_setup,
+ .supports_x2apic = intel_iommu_supports_eim,
};
/*
--- a/xen/include/asm-x86/iommu.h
+++ b/xen/include/asm-x86/iommu.h
@@ -66,6 +66,7 @@ static inline const struct iommu_ops *io
struct iommu_init_ops {
int (*setup)(void);
+ bool (*supports_x2apic)(void);
};
extern const struct iommu_init_ops *iommu_init_ops;
@@ -87,7 +88,14 @@ int iommu_setup_hpet_msi(struct msi_desc
int adjust_vtd_irq_affinities(void);
int __must_check iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte,
int order, int present);
-bool_t iommu_supports_eim(void);
+
+static inline bool iommu_supports_x2apic(void)
+{
+ return iommu_init_ops && iommu_init_ops->supports_x2apic
+ ? iommu_init_ops->supports_x2apic()
+ : false;
+}
+
int iommu_enable_x2apic_IR(void);
void iommu_disable_x2apic_IR(void);
++++++ 5cab2b4e-x86-IOMMU-abstract-iommu_enable_x2apic_IR.patch ++++++
References: bsc#1135799
# Commit 6d786fdbcdd5dfa0197719d8607a1fcc039d8bda
# Date 2019-04-08 13:06:54 +0200
# Author Jan Beulich
# Committer Jan Beulich
x86/IOMMU: abstract Intel-specific iommu_{en,dis}able_x2apic_IR()
Introduce respective elements in struct iommu_init_ops as well as a
pointer to the main ops structure.
Signed-off-by: Jan Beulich
Reviewed-by: Kevin Tian
Reviewed-by: Andrew Cooper
--- a/xen/arch/x86/apic.c
+++ b/xen/arch/x86/apic.c
@@ -510,7 +510,7 @@ static void resume_x2apic(void)
mask_8259A();
mask_IO_APIC_setup(ioapic_entries);
- iommu_enable_x2apic_IR();
+ iommu_enable_x2apic();
__enable_x2apic();
restore_IO_APIC_setup(ioapic_entries);
@@ -720,7 +720,7 @@ int lapic_suspend(void)
local_irq_save(flags);
disable_local_APIC();
- iommu_disable_x2apic_IR();
+ iommu_disable_x2apic();
local_irq_restore(flags);
return 0;
}
@@ -923,7 +923,7 @@ void __init x2apic_bsp_setup(void)
mask_8259A();
mask_IO_APIC_setup(ioapic_entries);
- switch ( iommu_enable_x2apic_IR() )
+ switch ( iommu_enable_x2apic() )
{
case 0:
break;
--- a/xen/drivers/passthrough/vtd/extern.h
+++ b/xen/drivers/passthrough/vtd/extern.h
@@ -35,6 +35,8 @@ void print_vtd_entries(struct iommu *iom
keyhandler_fn_t vtd_dump_iommu_info;
bool intel_iommu_supports_eim(void);
+int intel_iommu_enable_eim(void);
+void intel_iommu_disable_eim(void);
int enable_qinval(struct iommu *iommu);
void disable_qinval(struct iommu *iommu);
--- a/xen/drivers/passthrough/vtd/intremap.c
+++ b/xen/drivers/passthrough/vtd/intremap.c
@@ -882,23 +882,13 @@ out:
* This function is used to enable Interrupt remapping when
* enable x2apic
*/
-int iommu_enable_x2apic_IR(void)
+int intel_iommu_enable_eim(void)
{
struct acpi_drhd_unit *drhd;
struct iommu *iommu;
- if ( system_state < SYS_STATE_active )
- {
- if ( !intel_iommu_supports_eim() )
- return -EOPNOTSUPP;
-
- if ( !platform_supports_x2apic() )
- return -ENXIO;
-
- iommu_ops = intel_iommu_ops;
- }
- else if ( !x2apic_enabled )
- return -EOPNOTSUPP;
+ if ( system_state < SYS_STATE_active && !platform_supports_x2apic() )
+ return -ENXIO;
for_each_drhd_unit ( drhd )
{
@@ -943,17 +933,13 @@ int iommu_enable_x2apic_IR(void)
}
/*
- * This function is used to disable Interrutp remapping when
+ * This function is used to disable Interrupt remapping when
* suspend local apic
*/
-void iommu_disable_x2apic_IR(void)
+void intel_iommu_disable_eim(void)
{
struct acpi_drhd_unit *drhd;
- /* x2apic_enabled implies iommu_supports_eim(). */
- if ( !x2apic_enabled )
- return;
-
for_each_drhd_unit ( drhd )
disable_intremap(drhd->iommu);
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2720,6 +2720,8 @@ const struct iommu_ops __initconstrel in
.free_page_table = iommu_free_page_table,
.reassign_device = reassign_device_ownership,
.get_device_group_id = intel_iommu_group_id,
+ .enable_x2apic = intel_iommu_enable_eim,
+ .disable_x2apic = intel_iommu_disable_eim,
.update_ire_from_apic = io_apic_write_remap_rte,
.update_ire_from_msi = msi_msg_write_remap_rte,
.read_apic_from_ire = io_apic_read_remap_rte,
@@ -2736,6 +2738,7 @@ const struct iommu_ops __initconstrel in
};
const struct iommu_init_ops __initconstrel intel_iommu_init_ops = {
+ .ops = &intel_iommu_ops,
.setup = vtd_setup,
.supports_x2apic = intel_iommu_supports_eim,
};
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -26,6 +26,24 @@
const struct iommu_init_ops *__initdata iommu_init_ops;
struct iommu_ops __read_mostly iommu_ops;
+int iommu_enable_x2apic(void)
+{
+ if ( system_state < SYS_STATE_active )
+ {
+ if ( !iommu_supports_x2apic() )
+ return -EOPNOTSUPP;
+
+ iommu_ops = *iommu_init_ops->ops;
+ }
+ else if ( !x2apic_enabled )
+ return -EOPNOTSUPP;
+
+ if ( !iommu_ops.enable_x2apic )
+ return -EOPNOTSUPP;
+
+ return iommu_ops.enable_x2apic();
+}
+
void iommu_update_ire_from_apic(
unsigned int apic, unsigned int reg, unsigned int value)
{
--- a/xen/include/asm-x86/apic.h
+++ b/xen/include/asm-x86/apic.h
@@ -29,7 +29,6 @@ enum apic_mode {
};
extern u8 apic_verbosity;
-extern bool x2apic_enabled;
extern bool directed_eoi_enabled;
void check_x2apic_preenabled(void);
--- a/xen/include/asm-x86/apicdef.h
+++ b/xen/include/asm-x86/apicdef.h
@@ -126,4 +126,6 @@
#define MAX_IO_APICS 128
+extern bool x2apic_enabled;
+
#endif
--- a/xen/include/asm-x86/iommu.h
+++ b/xen/include/asm-x86/iommu.h
@@ -17,6 +17,7 @@
#include
#include
#include
+#include
#include
#include
@@ -65,6 +66,7 @@ static inline const struct iommu_ops *io
}
struct iommu_init_ops {
+ const struct iommu_ops *ops;
int (*setup)(void);
bool (*supports_x2apic)(void);
};
@@ -96,8 +98,13 @@ static inline bool iommu_supports_x2apic
: false;
}
-int iommu_enable_x2apic_IR(void);
-void iommu_disable_x2apic_IR(void);
+int iommu_enable_x2apic(void);
+
+static inline void iommu_disable_x2apic(void)
+{
+ if ( x2apic_enabled && iommu_ops.disable_x2apic )
+ iommu_ops.disable_x2apic();
+}
extern bool untrusted_msi;
--- a/xen/include/xen/iommu.h
+++ b/xen/include/xen/iommu.h
@@ -216,11 +216,16 @@ struct iommu_ops {
unsigned int *flags);
void (*free_page_table)(struct page_info *);
+
#ifdef CONFIG_X86
+ int (*enable_x2apic)(void);
+ void (*disable_x2apic)(void);
+
void (*update_ire_from_apic)(unsigned int apic, unsigned int reg, unsigned int value);
unsigned int (*read_apic_from_ire)(unsigned int apic, unsigned int reg);
int (*setup_hpet_msi)(struct msi_desc *);
#endif /* CONFIG_X86 */
+
int __must_check (*suspend)(void);
void (*resume)(void);
void (*share_p2m)(struct domain *d);
++++++ 5cab2b95-x86-IOMMU-initialize-iommu_ops-in.patch ++++++
References: bsc#1135799
# Commit 19127340a504c030901fc16d8475fc7d8cfdf8a5
# Date 2019-04-08 13:08:05 +0200
# Author Jan Beulich
# Committer Jan Beulich
x86/IOMMU: initialize iommu_ops in vendor-independent code
Move this into iommu_hardware_setup() and make that function non-
inline. Move its declaration into common code.
Signed-off-by: Jan Beulich
Reviewed-by: Kevin Tian
Reviewed-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -31,7 +31,6 @@
static bool_t __read_mostly init_done;
static const struct iommu_init_ops _iommu_init_ops;
-static const struct iommu_ops amd_iommu_ops;
struct amd_iommu *find_iommu_for_device(int seg, int bdf)
{
@@ -196,8 +195,6 @@ static int __init iov_detect(void)
if ( !iommu_enable && !iommu_intremap )
return 0;
- iommu_ops = amd_iommu_ops;
-
if ( amd_iommu_init() != 0 )
{
printk("AMD-Vi: Error initialization\n");
@@ -582,7 +579,7 @@ static void amd_dump_p2m_table(struct do
amd_dump_p2m_table_level(hd->arch.root_table, hd->arch.paging_mode, 0, 0);
}
-static const struct iommu_ops __initconstrel amd_iommu_ops = {
+static const struct iommu_ops __initconstrel _iommu_ops = {
.init = amd_iommu_domain_init,
.hwdom_init = amd_iommu_hwdom_init,
.add_device = amd_iommu_add_device,
@@ -609,5 +606,6 @@ static const struct iommu_ops __initcons
};
static const struct iommu_init_ops __initconstrel _iommu_init_ops = {
+ .ops = &_iommu_ops,
.setup = iov_detect,
};
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2305,8 +2305,6 @@ static int __init vtd_setup(void)
goto error;
}
- iommu_ops = intel_iommu_ops;
-
/* We enable the following features only if they are supported by all VT-d
* engines: Snoop Control, DMA passthrough, Queued Invalidation, Interrupt
* Remapping, and Posted Interrupt
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -26,6 +26,20 @@
const struct iommu_init_ops *__initdata iommu_init_ops;
struct iommu_ops __read_mostly iommu_ops;
+int __init iommu_hardware_setup(void)
+{
+ if ( !iommu_init_ops )
+ return -ENODEV;
+
+ if ( !iommu_ops.init )
+ iommu_ops = *iommu_init_ops->ops;
+ else
+ /* x2apic setup may have previously initialised the struct. */
+ ASSERT(iommu_ops.init == iommu_init_ops->ops->init);
+
+ return iommu_init_ops->setup();
+}
+
int iommu_enable_x2apic(void)
{
if ( system_state < SYS_STATE_active )
--- a/xen/include/asm-arm/iommu.h
+++ b/xen/include/asm-arm/iommu.h
@@ -26,8 +26,6 @@ struct arch_iommu
const struct iommu_ops *iommu_get_ops(void);
void iommu_set_ops(const struct iommu_ops *ops);
-int iommu_hardware_setup(void);
-
#endif /* __ARCH_ARM_IOMMU_H__ */
/*
--- a/xen/include/asm-x86/iommu.h
+++ b/xen/include/asm-x86/iommu.h
@@ -73,11 +73,6 @@ struct iommu_init_ops {
extern const struct iommu_init_ops *iommu_init_ops;
-static inline int iommu_hardware_setup(void)
-{
- return iommu_init_ops ? iommu_init_ops->setup() : -ENODEV;
-}
-
/* Are we using the domain P2M table as its IOMMU pagetable? */
#define iommu_use_hap_pt(d) \
(hap_enabled(d) && has_iommu_pt(d) && iommu_hap_pt_share)
--- a/xen/include/xen/iommu.h
+++ b/xen/include/xen/iommu.h
@@ -65,6 +65,7 @@ extern int8_t iommu_hwdom_reserved;
extern unsigned int iommu_dev_iotlb_timeout;
int iommu_setup(void);
+int iommu_hardware_setup(void);
int iommu_domain_init(struct domain *d);
void iommu_hwdom_init(struct domain *d);
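With the ops copy centralized here, iommu_hardware_setup() has to tolerate
that x2APIC setup may already have installed the vendor ops earlier in
boot. A standalone sketch of that once-only copy, using assert() in place
of Xen's ASSERT(); all stubs are invented for illustration.

#include <assert.h>
#include <errno.h>
#include <stdio.h>

struct iommu_ops { int (*init)(void); };
struct iommu_init_ops { const struct iommu_ops *ops; int (*setup)(void); };

static int vendor_init(void)  { return 0; }
static int vendor_setup(void) { return 0; }

static const struct iommu_ops vendor_ops = { .init = vendor_init };
static const struct iommu_init_ops vendor_init_ops = {
    .ops = &vendor_ops, .setup = vendor_setup,
};

static const struct iommu_init_ops *iommu_init_ops = &vendor_init_ops;
static struct iommu_ops iommu_ops; /* zero-filled until the first copy */

static int iommu_hardware_setup(void)
{
    if (!iommu_init_ops)
        return -ENODEV;

    if (!iommu_ops.init)
        iommu_ops = *iommu_init_ops->ops; /* first caller copies the ops */
    else
        /* x2APIC setup may have previously initialised the struct. */
        assert(iommu_ops.init == iommu_init_ops->ops->init);

    return iommu_init_ops->setup();
}

int main(void)
{
    printf("first call:  %d\n", iommu_hardware_setup());
    printf("second call: %d\n", iommu_hardware_setup()); /* assert holds */
    return 0;
}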
++++++ 5cac9a4b-x86-IOMMU-abstract-adjust_vtd_irq_affinities.patch ++++++
References: bsc#1135799
# Commit 5e5868724e35020ea5edbc10c715823a99887f3f
# Date 2019-04-09 15:12:43 +0200
# Author Jan Beulich
# Committer Jan Beulich
x86/IOMMU: abstract Intel-specific adjust_vtd_irq_affinities()
This can't be folded into the resume hook, as that runs before bringing
back up APs, but the affinity adjustment wants to happen with all CPUs
back online. Hence a separate hook is needed such that AMD can then
leverage it as well.
Signed-off-by: Jan Beulich
Reviewed-by: Kevin Tian
Reviewed-by: Andrew Cooper
--- a/xen/arch/x86/acpi/power.c
+++ b/xen/arch/x86/acpi/power.c
@@ -275,7 +275,7 @@ static int enter_state(u32 state)
mtrr_aps_sync_begin();
enable_nonboot_cpus();
mtrr_aps_sync_end();
- adjust_vtd_irq_affinities();
+ iommu_adjust_irq_affinities();
acpi_dmar_zap();
thaw_domains();
system_state = SYS_STATE_active;
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2141,7 +2141,7 @@ static void adjust_irq_affinity(struct a
dma_msi_set_affinity(irq_to_desc(drhd->iommu->msi.irq), cpumask);
}
-int adjust_vtd_irq_affinities(void)
+static int adjust_vtd_irq_affinities(void)
{
struct acpi_drhd_unit *drhd;
@@ -2725,6 +2725,7 @@ const struct iommu_ops __initconstrel in
.read_apic_from_ire = io_apic_read_remap_rte,
.read_msi_from_ire = msi_msg_read_remap_rte,
.setup_hpet_msi = intel_setup_hpet_msi,
+ .adjust_irq_affinities = adjust_vtd_irq_affinities,
.suspend = vtd_suspend,
.resume = vtd_resume,
.share_p2m = iommu_set_pgd,
--- a/xen/include/asm-x86/iommu.h
+++ b/xen/include/asm-x86/iommu.h
@@ -81,8 +81,14 @@ void iommu_update_ire_from_apic(unsigned
unsigned int iommu_read_apic_from_ire(unsigned int apic, unsigned int reg);
int iommu_setup_hpet_msi(struct msi_desc *);
+static inline int iommu_adjust_irq_affinities(void)
+{
+ return iommu_ops.adjust_irq_affinities
+ ? iommu_ops.adjust_irq_affinities()
+ : 0;
+}
+
/* While VT-d specific, this must get declared in a generic header. */
-int adjust_vtd_irq_affinities(void);
int __must_check iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte,
int order, int present);
--- a/xen/include/xen/iommu.h
+++ b/xen/include/xen/iommu.h
@@ -224,7 +224,10 @@ struct iommu_ops {
void (*update_ire_from_apic)(unsigned int apic, unsigned int reg, unsigned int value);
unsigned int (*read_apic_from_ire)(unsigned int apic, unsigned int reg);
+
int (*setup_hpet_msi)(struct msi_desc *);
+
+ int (*adjust_irq_affinities)(void);
#endif /* CONFIG_X86 */
int __must_check (*suspend)(void);
++++++ 5cdeac7f-AMD-IOMMU-adjust-IOMMU-list-head-init.patch ++++++
# Commit f6c1247184b6a5cfa57491f0ec4483896789fda6
# Date 2019-05-17 14:43:43 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: adjust IOMMU list head initialization
Do this statically, which will allow accessing the (empty) list even
without having come through acpi_ivrs_init().
Signed-off-by: Jan Beulich
Reviewed-by: Andrew Cooper
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -36,7 +36,7 @@ static struct tasklet amd_iommu_irq_task
unsigned int __read_mostly ivrs_bdf_entries;
u8 __read_mostly ivhd_type;
static struct radix_tree_root ivrs_maps;
-struct list_head amd_iommu_head;
+LIST_HEAD_READ_MOSTLY(amd_iommu_head);
struct table_struct device_table;
bool_t iommuv2_enabled;
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -173,8 +173,6 @@ static void amd_iommu_setup_domain_devic
int __init acpi_ivrs_init(void)
{
- INIT_LIST_HEAD(&amd_iommu_head);
-
if ( !iommu_enable && !iommu_intremap )
return 0;
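The point of LIST_HEAD_READ_MOSTLY() is that a statically initialized list
head is already a valid empty list, so it can be traversed even if
acpi_ivrs_init() never ran. A minimal standalone model of Linux/Xen-style
static list initialization; these are simplified macros, not the real ones.

#include <stdio.h>

struct list_head {
    struct list_head *next, *prev;
};

/* The static initializer makes the empty list valid at build time. */
#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define LIST_HEAD(name) struct list_head name = LIST_HEAD_INIT(name)

LIST_HEAD(amd_iommu_head); /* statically empty, like LIST_HEAD_READ_MOSTLY */

static int list_empty(const struct list_head *head)
{
    return head->next == head;
}

int main(void)
{
    /* Safe to inspect without any runtime INIT_LIST_HEAD() having run. */
    printf("empty: %d\n", list_empty(&amd_iommu_head));
    return 0;
}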
++++++ 5cf8da09-adjust-sysdom-creation-call-earlier-on-x86.patch ++++++
References: bsc#1135799
# Commit 7177f589ba84433e1ca8bb97a5d074545133c49c
# Date 2019-06-06 11:16:57 +0200
# Author Jan Beulich
# Committer Jan Beulich
adjust system domain creation (and call it earlier on x86)
Split out this mostly arch-independent code into a common-code helper
function. (This does away with Arm's arch_init_memory() altogether.)
On x86 this needs to happen before acpi_boot_init(): Commit 9fa94e1058
("x86/ACPI: also parse AMD IOMMU tables early") only appeared to work
fine - it's really broken, and doesn't crash (on non-EFI AMD systems)
only because of there being a mapping of linear address 0 during early
boot. On EFI there is:
Early fatal page fault at e008:ffff82d08024d58e (cr2=0000000000000220, ec=0000)
----[ Xen-4.13-unstable x86_64 debug=y Not tainted ]----
CPU: 0
RIP: e008:[<ffff82d08024d58e>] pci.c#_pci_hide_device+0x17/0x3a
RFLAGS: 0000000000010046 CONTEXT: hypervisor
rax: 0000000000000000 rbx: 0000000000006000 rcx: 0000000000000000
rdx: ffff83104f2ee9b0 rsi: ffff82e0209e5d48 rdi: ffff83104f2ee9a0
rbp: ffff82d08081fce0 rsp: ffff82d08081fcb8 r8: 0000000000000000
r9: 8000000000000000 r10: 0180000000000000 r11: 7fffffffffffffff
r12: ffff83104f2ee9a0 r13: 0000000000000002 r14: ffff83104f2ee4b0
r15: 0000000000000064 cr0: 0000000080050033 cr4: 00000000000000a0
cr3: 000000009f614000 cr2: 0000000000000220
fsb: 0000000000000000 gsb: 0000000000000000 gss: 0000000000000000
ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: 0000 cs: e008
Xen code around <ffff82d08024d58e> (pci.c#_pci_hide_device+0x17/0x3a):
48 89 47 38 48 8d 57 10 <48> 8b 88 20 02 00 00 48 89 51 08 48 89 4f 10 48
Xen stack trace from rsp=ffff82d08081fcb8:
[...]
Xen call trace:
[<ffff82d08024d58e>] pci.c#_pci_hide_device+0x17/0x3a
[<...>] pci_ro_device+...
[<ffff82d080617fe1>] amd_iommu_detect_one_acpi+0x161/0x249
[<ffff82d0806186ac>] iommu_acpi.c#detect_iommu_acpi+0xb5/0xe7
[<ffff82d08061cde0>] acpi_table_parse+0x61/0x90
[<ffff82d080619e7d>] amd_iommu_detect_acpi+0x17/0x19
[<ffff82d08061790b>] acpi_ivrs_init+0x20/0x5b
[<ffff82d08062e838>] acpi_boot_init+0x301/0x30f
[<ffff82d080628b10>] __start_xen+0x1daf/0x28a2
Pagetable walk from 0000000000000220:
L4[0x000] = 000000009f44f063 ffffffffffffffff
L3[0x000] = 000000009f44b063 ffffffffffffffff
L2[0x000] = 0000000000000000 ffffffffffffffff
****************************************
Panic on CPU 0:
FATAL TRAP: vector = 14 (page fault)
[error_code=0000] , IN INTERRUPT CONTEXT
****************************************
Of course the bug would nevertheless have lead to post-boot crashes as
soon as the list would actually get traversed.
Take the opportunity and
- convert BUG_ON()s being moved to panic(),
- add __read_mostly annotations to the dom_* definitions.
Signed-off-by: Jan Beulich
Reviewed-by: Andrew Cooper
Acked-by: Julien Grall
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -44,8 +44,6 @@
#include
#include
-struct domain *dom_xen, *dom_io, *dom_cow;
-
/* Override macros from asm/page.h to make them work with mfn_t */
#undef virt_to_mfn
#define virt_to_mfn(va) _mfn(__virt_to_mfn(va))
@@ -515,32 +513,6 @@ void flush_page_to_ram(unsigned long mfn
invalidate_icache();
}
-void __init arch_init_memory(void)
-{
- /*
- * Initialise our DOMID_XEN domain.
- * Any Xen-heap pages that we will allow to be mapped will have
- * their domain field set to dom_xen.
- */
- dom_xen = domain_create(DOMID_XEN, NULL, false);
- BUG_ON(IS_ERR(dom_xen));
-
- /*
- * Initialise our DOMID_IO domain.
- * This domain owns I/O pages that are within the range of the page_info
- * array. Mappings occur at the priv of the caller.
- */
- dom_io = domain_create(DOMID_IO, NULL, false);
- BUG_ON(IS_ERR(dom_io));
-
- /*
- * Initialise our COW domain.
- * This domain owns sharable pages.
- */
- dom_cow = domain_create(DOMID_COW, NULL, false);
- BUG_ON(IS_ERR(dom_cow));
-}
-
static inline lpae_t pte_of_xenaddr(vaddr_t va)
{
paddr_t ma = va + phys_offset;
--- a/xen/arch/arm/setup.c
+++ b/xen/arch/arm/setup.c
@@ -850,7 +850,7 @@ void __init start_xen(unsigned long boot
rcu_init();
- arch_init_memory();
+ setup_system_domains();
local_irq_enable();
local_abort_enable();
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -158,9 +158,6 @@ l1_pgentry_t __section(".bss.page_aligne
paddr_t __read_mostly mem_hotplug;
-/* Private domain structs for DOMID_XEN and DOMID_IO. */
-struct domain *dom_xen, *dom_io, *dom_cow;
-
/* Frame table size in pages. */
unsigned long max_page;
unsigned long total_pages;
@@ -281,32 +278,6 @@ void __init arch_init_memory(void)
_PAGE_DIRTY | _PAGE_AVAIL | _PAGE_AVAIL_HIGH | _PAGE_NX);
/*
- * Initialise our DOMID_XEN domain.
- * Any Xen-heap pages that we will allow to be mapped will have
- * their domain field set to dom_xen.
- * Hidden PCI devices will also be associated with this domain
- * (but be [partly] controlled by Dom0 nevertheless).
- */
- dom_xen = domain_create(DOMID_XEN, NULL, false);
- BUG_ON(IS_ERR(dom_xen));
- INIT_LIST_HEAD(&dom_xen->arch.pdev_list);
-
- /*
- * Initialise our DOMID_IO domain.
- * This domain owns I/O pages that are within the range of the page_info
- * array. Mappings occur at the priv of the caller.
- */
- dom_io = domain_create(DOMID_IO, NULL, false);
- BUG_ON(IS_ERR(dom_io));
-
- /*
- * Initialise our COW domain.
- * This domain owns sharable pages.
- */
- dom_cow = domain_create(DOMID_COW, NULL, false);
- BUG_ON(IS_ERR(dom_cow));
-
- /*
* First 1MB of RAM is historically marked as I/O.
* Note that apart from IO Xen also uses the low 1MB to store the AP boot
* trampoline and boot information metadata. Due to this always special
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1538,6 +1538,8 @@ void __init noreturn __start_xen(unsigne
mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
RANGESETF_prettyprint_hex);
+ setup_system_domains();
+
acpi_boot_init();
if ( smp_found_config )
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -72,6 +72,11 @@ domid_t hardware_domid __read_mostly;
integer_param("hardware_dom", hardware_domid);
#endif
+/* Private domain structs for DOMID_XEN, DOMID_IO, etc. */
+struct domain *__read_mostly dom_xen;
+struct domain *__read_mostly dom_io;
+struct domain *__read_mostly dom_cow;
+
struct vcpu *idle_vcpu[NR_CPUS] __read_mostly;
vcpu_info_t dummy_vcpu_info;
@@ -518,6 +523,39 @@ struct domain *domain_create(domid_t dom
return ERR_PTR(err);
}
+void __init setup_system_domains(void)
+{
+ /*
+ * Initialise our DOMID_XEN domain.
+ * Any Xen-heap pages that we will allow to be mapped will have
+ * their domain field set to dom_xen.
+ * Hidden PCI devices will also be associated with this domain
+ * (but be [partly] controlled by Dom0 nevertheless).
+ */
+ dom_xen = domain_create(DOMID_XEN, NULL, false);
+ if ( IS_ERR(dom_xen) )
+ panic("Failed to create d[XEN]: %ld\n", PTR_ERR(dom_xen));
+#ifdef CONFIG_HAS_PCI
+ INIT_LIST_HEAD(&dom_xen->arch.pdev_list);
+#endif
+
+ /*
+ * Initialise our DOMID_IO domain.
+ * This domain owns I/O pages that are within the range of the page_info
+ * array. Mappings occur at the priv of the caller.
+ */
+ dom_io = domain_create(DOMID_IO, NULL, false);
+ if ( IS_ERR(dom_io) )
+ panic("Failed to create d[IO]: %ld\n", PTR_ERR(dom_io));
+
+ /*
+ * Initialise our COW domain.
+ * This domain owns sharable pages.
+ */
+ dom_cow = domain_create(DOMID_COW, NULL, false);
+ if ( IS_ERR(dom_cow) )
+ panic("Failed to create d[COW]: %ld\n", PTR_ERR(dom_cow));
+}
void domain_update_node_affinity(struct domain *d)
{
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -334,8 +334,6 @@ long arch_memory_op(int op, XEN_GUEST_HA
unsigned long domain_get_maximum_gpfn(struct domain *d);
-extern struct domain *dom_xen, *dom_io, *dom_cow;
-
#define memguard_guard_stack(_p) ((void)0)
#define memguard_guard_range(_p,_l) ((void)0)
#define memguard_unguard_range(_p,_l) ((void)0)
--- a/xen/include/asm-arm/setup.h
+++ b/xen/include/asm-arm/setup.h
@@ -77,8 +77,6 @@ extern struct bootinfo bootinfo;
extern domid_t max_init_domid;
-void arch_init_memory(void);
-
void copy_from_paddr(void *dst, paddr_t paddr, unsigned long len);
size_t estimate_efi_size(int mem_nr_banks);
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -597,8 +597,6 @@ unsigned int domain_clamp_alloc_bitsize(
unsigned long domain_get_maximum_gpfn(struct domain *d);
-extern struct domain *dom_xen, *dom_io, *dom_cow; /* for vmcoreinfo */
-
/* Definition of an mm lock: spinlock with extra fields for debugging */
typedef struct mm_lock {
spinlock_t lock;
--- a/xen/include/xen/domain.h
+++ b/xen/include/xen/domain.h
@@ -22,6 +22,8 @@ struct vcpu *alloc_dom0_vcpu0(struct dom
int vcpu_reset(struct vcpu *);
int vcpu_up(struct vcpu *v);
+void setup_system_domains(void);
+
struct xen_domctl_getdomaininfo;
void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info);
void arch_get_domain_info(const struct domain *d,
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -276,6 +276,9 @@ struct npfec {
#define MAX_ORDER 20 /* 2^20 contiguous pages */
#endif
+/* Private domain structs for DOMID_XEN, DOMID_IO, etc. */
+extern struct domain *dom_xen, *dom_io, *dom_cow;
+
#define page_list_entry list_head
#include
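domain_create() reports failure through the pointer value itself, which is
why the new helper checks IS_ERR() and panics with PTR_ERR(). A rough
standalone model of that error-pointer convention, assuming the usual
top-of-address-space encoding; the domid constant and errno values below
are illustrative, not Xen's.

#include <stdio.h>
#include <stdlib.h>

/* Small negative errno values are encoded into the pointer itself. */
#define MAX_ERRNO 4095

static void *ERR_PTR(long err) { return (void *)err; }
static long PTR_ERR(const void *p) { return (long)p; }
static int IS_ERR(const void *p)
{
    return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

struct domain { unsigned int domid; };

static struct domain *domain_create(int domid)
{
    struct domain *d;

    if (domid < 0)
        return ERR_PTR(-22); /* -EINVAL, illustrative */
    d = malloc(sizeof(*d));
    if (!d)
        return ERR_PTR(-12); /* -ENOMEM, illustrative */
    d->domid = domid;
    return d;
}

int main(void)
{
    /* 0x7ff4 is an arbitrary stand-in, not the real DOMID_XEN constant. */
    struct domain *dom_xen = domain_create(0x7ff4);

    if (IS_ERR(dom_xen)) /* mirrors the patch: panic() instead of BUG_ON() */
    {
        fprintf(stderr, "Failed to create d[XEN]: %ld\n", PTR_ERR(dom_xen));
        exit(1);
    }
    printf("created domid %#x\n", dom_xen->domid);
    return 0;
}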
++++++ 5d0cf4e4-AMD-IOMMU-initialize-IRQ-tasklet-once.patch ++++++
References: bsc#1135799
# Commit 587c39e552fbbdfa4ca64c703ad9df7191a76e8e
# Date 2019-06-21 17:16:52 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: initialize IRQ tasklet only once
Don't do this once per IOMMU, nor after setting up the IOMMU interrupt
(which will want to schedule this tasklet). In fact it can be
initialized at build time.
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -31,7 +31,8 @@
static int __initdata nr_amd_iommus;
-static struct tasklet amd_iommu_irq_tasklet;
+static void do_amd_iommu_irq(unsigned long data);
+static DECLARE_SOFTIRQ_TASKLET(amd_iommu_irq_tasklet, do_amd_iommu_irq, 0);
unsigned int __read_mostly ivrs_bdf_entries;
u8 __read_mostly ivhd_type;
@@ -1056,8 +1057,6 @@ static int __init amd_iommu_init_one(str
printk("AMD-Vi: IOMMU %d Enabled.\n", nr_amd_iommus );
nr_amd_iommus++;
- softirq_tasklet_init(&amd_iommu_irq_tasklet, do_amd_iommu_irq, 0);
-
return 0;
error_out:
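DECLARE_SOFTIRQ_TASKLET expands to a static initializer, so the tasklet is
valid from build time and can no longer be re-initialized (and thereby
clobbered) once per IOMMU. A simplified standalone model of such a
declare-style macro; the struct layout here is invented.

#include <stdio.h>

struct tasklet {
    void (*func)(unsigned long);
    unsigned long data;
    int scheduled;
};

/* Build-time initialization: no runtime softirq_tasklet_init() needed. */
#define DECLARE_TASKLET(name, fn, arg) \
    struct tasklet name = { .func = fn, .data = arg, .scheduled = 0 }

static void do_irq_work(unsigned long data)
{
    printf("tasklet ran with data %lu\n", data);
}

static DECLARE_TASKLET(irq_tasklet, do_irq_work, 0);

int main(void)
{
    /* Safe to schedule even before any per-device init code has run. */
    irq_tasklet.scheduled = 1;
    if (irq_tasklet.scheduled)
        irq_tasklet.func(irq_tasklet.data);
    return 0;
}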
++++++ 5d149bb0-AMD-IOMMU-dont-add-IOMMUs.patch ++++++
References: bsc#1135799
# Commit 757122c0cf35281618e80cdab37f4f44e5e5ff55
# Date 2019-06-27 12:34:24 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: don't "add" IOMMUs
For find_iommu_for_device() to consistently (independent of ACPI tables)
return NULL for the PCI devices corresponding to IOMMUs, make sure
IOMMUs don't get mapped to themselves by ivrs_mappings[].
While amd_iommu_add_device() won't be called for IOMMUs from
pci_add_device(), as IOMMUs have got marked r/o,
_setup_hwdom_pci_devices() calls there nevertheless. Avoid issuing the
bogus debugging only "No iommu for ...; cannot be handed to ..." log
message as well as the non-debugging "setup ... for ... failed (-19)"
one.
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -81,8 +81,8 @@ static void __init add_ivrs_mapping_entr
ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse;
}
}
- /* assgin iommu hardware */
- ivrs_mappings[bdf].iommu = iommu;
+ /* Assign IOMMU hardware, but don't map an IOMMU by itself. */
+ ivrs_mappings[bdf].iommu = iommu->bdf != bdf ? iommu : NULL;
}
static struct amd_iommu * __init find_iommu_from_bdf_cap(
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -43,7 +43,7 @@ struct amd_iommu *find_iommu_for_device(
{
unsigned int bd0 = bdf & ~PCI_FUNC(~0);
- if ( ivrs_mappings[bd0].iommu )
+ if ( ivrs_mappings[bd0].iommu && ivrs_mappings[bd0].iommu->bdf != bdf )
{
struct ivrs_mappings tmp = ivrs_mappings[bd0];
@@ -449,6 +449,11 @@ static int amd_iommu_add_device(u8 devfn
return -EINVAL;
bdf = PCI_BDF2(pdev->bus, pdev->devfn);
+
+ for_each_amd_iommu(iommu)
+ if ( pdev->seg == iommu->seg && bdf == iommu->bdf )
+ return is_hardware_domain(pdev->domain) ? 0 : -ENODEV;
+
iommu = find_iommu_for_device(pdev->seg, bdf);
if ( unlikely(!iommu) )
{
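The core of the fix is a single guarded assignment when populating the
IVRS mapping table: an IOMMU is never recorded as handling its own BDF, so
lookups for the IOMMU's own PCI device consistently come back empty. A
standalone sketch of that guard; the table size and BDF values are
arbitrary.

#include <stdio.h>

struct amd_iommu { unsigned int bdf; };
struct ivrs_mapping { const struct amd_iommu *iommu; };

#define IVRS_ENTRIES 4

static struct ivrs_mapping ivrs_mappings[IVRS_ENTRIES];

static void add_ivrs_mapping_entry(unsigned int bdf,
                                   const struct amd_iommu *iommu)
{
    /* Assign IOMMU hardware, but don't map an IOMMU by itself. */
    ivrs_mappings[bdf].iommu = iommu->bdf != bdf ? iommu : NULL;
}

int main(void)
{
    struct amd_iommu iommu = { .bdf = 2 };
    unsigned int bdf;

    for (bdf = 0; bdf < IVRS_ENTRIES; bdf++)
        add_ivrs_mapping_entry(bdf, &iommu);

    for (bdf = 0; bdf < IVRS_ENTRIES; bdf++)
        printf("bdf %u -> %s\n", bdf,
               ivrs_mappings[bdf].iommu ? "iommu" : "none");
    return 0;
}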
++++++ 5d1b3fab-AMD-IOMMU-restrict-feature-logging.patch ++++++
References: bsc#1135799
# Commit 93ef224d63f9f04a0897d64981c619eb4816c0d3
# Date 2019-07-02 13:27:39 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: restrict feature logging
The common case is all IOMMUs having the same features. Log them only
for the first IOMMU, or for any that have a differing feature set.
Requested-by: Andrew Cooper
Signed-off-by: Jan Beulich
Reviewed-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/iommu_detect.c
+++ b/xen/drivers/passthrough/amd/iommu_detect.c
@@ -62,6 +62,7 @@ void __init get_iommu_features(struct am
{
u32 low, high;
int i = 0 ;
+ const struct amd_iommu *first;
static const char *__initdata feature_str[] = {
"- Prefetch Pages Command",
"- Peripheral Page Service Request",
@@ -89,6 +90,11 @@ void __init get_iommu_features(struct am
iommu->features = ((u64)high << 32) | low;
+ /* Don't log the same set of features over and over. */
+ first = list_first_entry(&amd_iommu_head, struct amd_iommu, list);
+ if ( iommu != first && iommu->features == first->features )
+ return;
+
printk("AMD-Vi: IOMMU Extended Features:\n");
while ( feature_str[i] )
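The deduplication is a simple compare against the first list entry: with a
homogeneous set of IOMMUs only one feature block is logged. A standalone
model of that check; the feature bits are made up.

#include <stdio.h>

struct iommu { unsigned long long features; };

static void log_features(const struct iommu *iommu, const struct iommu *first)
{
    /* Don't log the same set of features over and over. */
    if (iommu != first && iommu->features == first->features)
        return;
    printf("IOMMU features: %#llx\n", iommu->features);
}

int main(void)
{
    struct iommu iommus[] = { { 0x3f }, { 0x3f }, { 0x1f } };
    unsigned int i;

    for (i = 0; i < sizeof(iommus) / sizeof(iommus[0]); i++)
        log_features(&iommus[i], &iommus[0]);
    /* Prints twice: once for the first IOMMU, once for the odd one out. */
    return 0;
}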
++++++ 5d358508-x86-IRQ-desc-affinity-represents-request.patch ++++++
References: bsc#1135799
# Commit 4a9dfab767e2f5d1b7b919b07099f3fc87a67fb6
# Date 2019-07-22 11:42:32 +0200
# Author Jan Beulich
# Committer Jan Beulich
x86/IRQ: desc->affinity should strictly represent the requested value
desc->arch.cpu_mask reflects the actual set of target CPUs. Don't ever
fiddle with desc->affinity itself, except to store caller requested
values. Note that assign_irq_vector() now takes a NULL incoming CPU mask
to mean "all CPUs" now, rather than just "all currently online CPUs".
This way no further affinity adjustment is needed after onlining further
CPUs.
This renders both set_native_irq_info() uses (which weren't using proper
locking anyway) redundant - drop the function altogether.
Signed-off-by: Jan Beulich
Reviewed-by: Roger Pau Monné
Acked-by: Andrew Cooper
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -1041,7 +1041,6 @@ static void __init setup_IO_APIC_irqs(vo
SET_DEST(entry, logical, cpu_mask_to_apicid(TARGET_CPUS));
spin_lock_irqsave(&ioapic_lock, flags);
__ioapic_write_entry(apic, pin, 0, entry);
- set_native_irq_info(irq, TARGET_CPUS);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
}
@@ -2252,7 +2251,6 @@ int io_apic_set_pci_routing (int ioapic,
spin_lock_irqsave(&ioapic_lock, flags);
__ioapic_write_entry(ioapic, pin, 0, entry);
- set_native_irq_info(irq, TARGET_CPUS);
spin_unlock(&ioapic_lock);
spin_lock(&desc->lock);
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -559,11 +559,16 @@ int assign_irq_vector(int irq, const cpu
spin_lock_irqsave(&vector_lock, flags);
ret = __assign_irq_vector(irq, desc, mask ?: TARGET_CPUS);
- if (!ret) {
+ if ( !ret )
+ {
ret = desc->arch.vector;
- cpumask_copy(desc->affinity, desc->arch.cpu_mask);
+ if ( mask )
+ cpumask_copy(desc->affinity, mask);
+ else
+ cpumask_setall(desc->affinity);
}
spin_unlock_irqrestore(&vector_lock, flags);
+
return ret;
}
@@ -2306,9 +2311,10 @@ static void dump_irqs(unsigned char key)
spin_lock_irqsave(&desc->lock, flags);
- printk(" IRQ:%4d affinity:%*pb vec:%02x type=%-15s status=%08x ",
- irq, nr_cpu_ids, cpumask_bits(desc->affinity), desc->arch.vector,
- desc->handler->typename, desc->status);
+ printk(" IRQ:%4d vec:%02x %-15s status=%03x aff:{%*pbl}/{%*pbl} ",
+ irq, desc->arch.vector, desc->handler->typename, desc->status,
+ nr_cpu_ids, cpumask_bits(desc->affinity),
+ nr_cpu_ids, cpumask_bits(desc->arch.cpu_mask));
if ( ssid )
printk("Z=%-25s ", ssid);
@@ -2388,19 +2394,19 @@ void fixup_irqs(const cpumask_t *mask, b
vector <= LAST_HIPRIORITY_VECTOR )
cpumask_and(desc->arch.cpu_mask, desc->arch.cpu_mask, mask);
- cpumask_copy(&affinity, desc->affinity);
- if ( !desc->action || cpumask_subset(&affinity, mask) )
+ if ( !desc->action || cpumask_subset(desc->affinity, mask) )
{
spin_unlock(&desc->lock);
continue;
}
- cpumask_and(&affinity, &affinity, mask);
- if ( cpumask_empty(&affinity) )
+ if ( !cpumask_intersects(mask, desc->affinity) )
{
break_affinity = true;
- cpumask_copy(&affinity, mask);
+ cpumask_setall(&affinity);
}
+ else
+ cpumask_copy(&affinity, desc->affinity);
if ( desc->handler->disable )
desc->handler->disable(desc);
--- a/xen/include/xen/irq.h
+++ b/xen/include/xen/irq.h
@@ -162,11 +162,6 @@ extern irq_desc_t *domain_spin_lock_irq_
extern irq_desc_t *pirq_spin_lock_irq_desc(
const struct pirq *, unsigned long *pflags);
-static inline void set_native_irq_info(unsigned int irq, const cpumask_t *mask)
-{
- cpumask_copy(irq_to_desc(irq)->affinity, mask);
-}
-
unsigned int set_desc_affinity(struct irq_desc *, const cpumask_t *);
#ifndef arch_hwdom_irqs
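The invariant this patch establishes: desc->affinity stores the caller's
request verbatim (with NULL now meaning all CPUs, online or not), while
desc->arch.cpu_mask tracks the CPUs actually targeted. A standalone sketch
using plain bitmasks in place of cpumask_t; the allocation result is
faked.

#include <stdio.h>

#define NR_CPUS 8

struct irq_desc {
    unsigned int affinity; /* requested CPU set, stored as given */
    unsigned int cpu_mask; /* effective target CPUs */
};

static void assign_irq_vector(struct irq_desc *desc, const unsigned int *mask)
{
    desc->cpu_mask = 1u << 0; /* pretend the allocator chose CPU0 */

    if (mask)
        desc->affinity = *mask;               /* store the request verbatim */
    else
        desc->affinity = (1u << NR_CPUS) - 1; /* NULL now means all CPUs */
}

int main(void)
{
    struct irq_desc desc;
    unsigned int req = 0x6; /* CPUs 1-2 */

    assign_irq_vector(&desc, NULL);
    printf("all: affinity=%#x cpu_mask=%#x\n", desc.affinity, desc.cpu_mask);
    assign_irq_vector(&desc, &req);
    printf("req: affinity=%#x cpu_mask=%#x\n", desc.affinity, desc.cpu_mask);
    return 0;
}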
++++++ 5d358534-x86-IRQ-consolidate-arch-cpu_mask-use.patch ++++++
References: bsc#1135799
# Commit 481a478b3af4b1b33f9a121192a743f17a901457
# Date 2019-07-22 11:43:16 +0200
# Author Jan Beulich
# Committer Jan Beulich
x86/IRQ: consolidate use of ->arch.cpu_mask
Mixed meaning was implied so far by different pieces of code -
disagreement was in particular about whether to expect offline CPUs'
bits to possibly be set. Switch to a mostly consistent meaning
(exception being high priority interrupts, which would perhaps better
be switched to the same model as well in due course). Use the field to
record the vector allocation mask, i.e. potentially including bits of
offline (parked) CPUs. This implies that before passing the mask to
certain functions (most notably cpu_mask_to_apicid()) it needs to be
further reduced to the online subset.
The exception of high priority interrupts is also why for the moment
_bind_irq_vector() is left as is, despite looking wrong: It's used
exclusively for IRQ0, which isn't supposed to move off CPU0 at any time.
The prior lack of restricting to online CPUs in set_desc_affinity()
before calling cpu_mask_to_apicid() in particular allowed (in x2APIC
clustered mode) offlined CPUs to end up enabled in an IRQ's destination
field. (I wonder whether vector_allocation_cpumask_flat() shouldn't
follow a similar model, using cpu_present_map in favor of
cpu_online_map.)
For IO-APIC code it was definitely wrong to potentially store, as a
fallback, TARGET_CPUS (i.e. all online ones) into the field, as that
would have caused problems when determining on which CPUs to release
vectors when they've gone out of use. Disable interrupts instead when
no valid target CPU can be established (which code elsewhere should
guarantee to never happen), and log a message in such an unlikely event.
Signed-off-by: Jan Beulich
Reviewed-by: Roger Pau Monné
Acked-by: Andrew Cooper
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -679,7 +679,7 @@ void /*__init*/ setup_ioapic_dest(void)
continue;
irq = pin_2_irq(irq_entry, ioapic, pin);
desc = irq_to_desc(irq);
- BUG_ON(cpumask_empty(desc->arch.cpu_mask));
+ BUG_ON(!cpumask_intersects(desc->arch.cpu_mask, &cpu_online_map));
set_ioapic_affinity_irq(desc, desc->arch.cpu_mask);
}
@@ -2198,7 +2198,6 @@ int io_apic_set_pci_routing (int ioapic,
{
struct irq_desc *desc = irq_to_desc(irq);
struct IO_APIC_route_entry entry;
- cpumask_t mask;
unsigned long flags;
int vector;
@@ -2233,11 +2232,17 @@ int io_apic_set_pci_routing (int ioapic,
return vector;
entry.vector = vector;
- cpumask_copy(&mask, TARGET_CPUS);
- /* Don't chance ending up with an empty mask. */
- if (cpumask_intersects(&mask, desc->arch.cpu_mask))
- cpumask_and(&mask, &mask, desc->arch.cpu_mask);
- SET_DEST(entry, logical, cpu_mask_to_apicid(&mask));
+ if (cpumask_intersects(desc->arch.cpu_mask, TARGET_CPUS)) {
+ cpumask_t *mask = this_cpu(scratch_cpumask);
+
+ cpumask_and(mask, desc->arch.cpu_mask, TARGET_CPUS);
+ SET_DEST(entry, logical, cpu_mask_to_apicid(mask));
+ } else {
+ printk(XENLOG_ERR "IRQ%d: no target CPU (%*pb vs %*pb)\n",
+ irq, nr_cpu_ids, cpumask_bits(desc->arch.cpu_mask),
+ nr_cpu_ids, cpumask_bits(TARGET_CPUS));
+ desc->status |= IRQ_DISABLED;
+ }
apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
"(%d-%d -> %#x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
@@ -2423,7 +2428,21 @@ int ioapic_guest_write(unsigned long phy
/* Set the vector field to the real vector! */
rte.vector = desc->arch.vector;
- SET_DEST(rte, logical, cpu_mask_to_apicid(desc->arch.cpu_mask));
+ if ( cpumask_intersects(desc->arch.cpu_mask, TARGET_CPUS) )
+ {
+ cpumask_t *mask = this_cpu(scratch_cpumask);
+
+ cpumask_and(mask, desc->arch.cpu_mask, TARGET_CPUS);
+ SET_DEST(rte, logical, cpu_mask_to_apicid(mask));
+ }
+ else
+ {
+ gprintk(XENLOG_ERR, "IRQ%d: no target CPU (%*pb vs %*pb)\n",
+ irq, nr_cpu_ids, cpumask_bits(desc->arch.cpu_mask),
+ nr_cpu_ids, cpumask_bits(TARGET_CPUS));
+ desc->status |= IRQ_DISABLED;
+ rte.mask = 1;
+ }
__ioapic_write_entry(apic, pin, 0, rte);
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -457,11 +457,13 @@ static int __assign_irq_vector(
*/
static int current_vector = FIRST_DYNAMIC_VECTOR, current_offset = 0;
int cpu, err, old_vector;
- cpumask_t tmp_mask;
vmask_t *irq_used_vectors = NULL;
old_vector = irq_to_vector(irq);
- if (old_vector > 0) {
+ if ( old_vector > 0 )
+ {
+ cpumask_t tmp_mask;
+
cpumask_and(&tmp_mask, mask, &cpu_online_map);
if (cpumask_intersects(&tmp_mask, desc->arch.cpu_mask)) {
desc->arch.vector = old_vector;
@@ -484,7 +486,9 @@ static int __assign_irq_vector(
else
irq_used_vectors = irq_get_used_vector_mask(irq);
- for_each_cpu(cpu, mask) {
+ for_each_cpu(cpu, mask)
+ {
+ const cpumask_t *vec_mask;
int new_cpu;
int vector, offset;
@@ -492,8 +496,7 @@ static int __assign_irq_vector(
if (!cpu_online(cpu))
continue;
- cpumask_and(&tmp_mask, vector_allocation_cpumask(cpu),
- &cpu_online_map);
+ vec_mask = vector_allocation_cpumask(cpu);
vector = current_vector;
offset = current_offset;
@@ -514,7 +517,7 @@ next:
&& test_bit(vector, irq_used_vectors) )
goto next;
- for_each_cpu(new_cpu, &tmp_mask)
+ for_each_cpu(new_cpu, vec_mask)
if (per_cpu(vector_irq, new_cpu)[vector] >= 0)
goto next;
/* Found one! */
@@ -525,11 +528,11 @@ next:
cpumask_copy(desc->arch.old_cpu_mask, desc->arch.cpu_mask);
desc->arch.old_vector = desc->arch.vector;
}
- trace_irq_mask(TRC_HW_IRQ_ASSIGN_VECTOR, irq, vector, &tmp_mask);
- for_each_cpu(new_cpu, &tmp_mask)
+ trace_irq_mask(TRC_HW_IRQ_ASSIGN_VECTOR, irq, vector, vec_mask);
+ for_each_cpu(new_cpu, vec_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq;
desc->arch.vector = vector;
- cpumask_copy(desc->arch.cpu_mask, &tmp_mask);
+ cpumask_copy(desc->arch.cpu_mask, vec_mask);
desc->arch.used = IRQ_USED;
ASSERT((desc->arch.used_vectors == NULL)
@@ -761,6 +764,7 @@ unsigned int set_desc_affinity(struct ir
cpumask_copy(desc->affinity, mask);
cpumask_and(&dest_mask, mask, desc->arch.cpu_mask);
+ cpumask_and(&dest_mask, &dest_mask, &cpu_online_map);
return cpu_mask_to_apicid(&dest_mask);
}
--- a/xen/include/asm-x86/irq.h
+++ b/xen/include/asm-x86/irq.h
@@ -32,6 +32,12 @@ struct irq_desc;
struct arch_irq_desc {
s16 vector; /* vector itself is only 8 bits, */
s16 old_vector; /* but we use -1 for unassigned */
+ /*
+ * Except for high priority interrupts @cpu_mask may have bits set for
+ * offline CPUs. Consumers need to be careful to mask this down to
+ * online ones as necessary. There is supposed to always be a non-
+ * empty intersection with cpu_online_map.
+ */
cpumask_var_t cpu_mask;
cpumask_var_t old_cpu_mask;
cpumask_var_t pending_mask;
++++++ 5d358a67-AMD-IOMMU-pass-IOMMU-to-iterate_ivrs_entries-cb.patch ++++++
References: bsc#1135799
# Commit 4e8e9875622cbdb24469c43ca5f08f83dc59bbca
# Date 2019-07-22 12:05:27 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: pass IOMMU to iterate_ivrs_entries() callback
Both users will want to know IOMMU properties (specifically the IRTE
size) subsequently. Leverage this to avoid pointless calls to the
callback when IVRS mapping table entries are unpopulated. To avoid
leaking interrupt remapping tables (bogusly) allocated for IOMMUs
themselves, this requires suppressing their allocation in the first
place, taking a step further what commit 757122c0cf ('AMD/IOMMU: don't
"add" IOMMUs') had done.
Additionally suppress the call for alias entries, as again both users
don't care about these anyway. In fact this eliminates a fair bit of
redundancy from dump output.
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -65,7 +65,11 @@ static void __init add_ivrs_mapping_entr
/* override flags for range of devices */
ivrs_mappings[bdf].device_flags = flags;
- if (ivrs_mappings[alias_id].intremap_table == NULL )
+ /* Don't map an IOMMU by itself. */
+ if ( iommu->bdf == bdf )
+ return;
+
+ if ( !ivrs_mappings[alias_id].intremap_table )
{
/* allocate per-device interrupt remapping table */
if ( amd_iommu_perdev_intremap )
@@ -81,8 +85,9 @@ static void __init add_ivrs_mapping_entr
ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse;
}
}
- /* Assign IOMMU hardware, but don't map an IOMMU by itself. */
- ivrs_mappings[bdf].iommu = iommu->bdf != bdf ? iommu : NULL;
+
+ /* Assign IOMMU hardware. */
+ ivrs_mappings[bdf].iommu = iommu;
}
static struct amd_iommu * __init find_iommu_from_bdf_cap(
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1124,7 +1124,8 @@ int iterate_ivrs_mappings(int (*handler)
return rc;
}
-int iterate_ivrs_entries(int (*handler)(u16 seg, struct ivrs_mappings *))
+int iterate_ivrs_entries(int (*handler)(const struct amd_iommu *,
+ struct ivrs_mappings *))
{
u16 seg = 0;
int rc = 0;
@@ -1137,7 +1138,12 @@ int iterate_ivrs_entries(int (*handler)(
break;
seg = IVRS_MAPPINGS_SEG(map);
for ( bdf = 0; !rc && bdf < ivrs_bdf_entries; ++bdf )
- rc = handler(seg, map + bdf);
+ {
+ const struct amd_iommu *iommu = map[bdf].iommu;
+
+ if ( iommu && map[bdf].dte_requestor_id == bdf )
+ rc = handler(iommu, &map[bdf]);
+ }
} while ( !rc && ++seg );
return rc;
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -607,7 +607,7 @@ void amd_iommu_read_msi_from_ire(
}
int __init amd_iommu_free_intremap_table(
- u16 seg, struct ivrs_mappings *ivrs_mapping)
+ const struct amd_iommu *iommu, struct ivrs_mappings *ivrs_mapping)
{
void *tb = ivrs_mapping->intremap_table;
@@ -683,14 +683,15 @@ static void dump_intremap_table(const u3
}
}
-static int dump_intremap_mapping(u16 seg, struct ivrs_mappings *ivrs_mapping)
+static int dump_intremap_mapping(const struct amd_iommu *iommu,
+ struct ivrs_mappings *ivrs_mapping)
{
unsigned long flags;
if ( !ivrs_mapping )
return 0;
- printk(" %04x:%02x:%02x:%u:\n", seg,
+ printk(" %04x:%02x:%02x:%u:\n", iommu->seg,
PCI_BUS(ivrs_mapping->dte_requestor_id),
PCI_SLOT(ivrs_mapping->dte_requestor_id),
PCI_FUNC(ivrs_mapping->dte_requestor_id));
--- a/xen/include/asm-x86/amd-iommu.h
+++ b/xen/include/asm-x86/amd-iommu.h
@@ -132,7 +132,8 @@ extern u8 ivhd_type;
struct ivrs_mappings *get_ivrs_mappings(u16 seg);
int iterate_ivrs_mappings(int (*)(u16 seg, struct ivrs_mappings *));
-int iterate_ivrs_entries(int (*)(u16 seg, struct ivrs_mappings *));
+int iterate_ivrs_entries(int (*)(const struct amd_iommu *,
+ struct ivrs_mappings *));
/* iommu tables in guest space */
struct mmio_reg {
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -97,7 +97,8 @@ struct amd_iommu *find_iommu_for_device(
/* interrupt remapping */
int amd_iommu_setup_ioapic_remapping(void);
void *amd_iommu_alloc_intremap_table(unsigned long **);
-int amd_iommu_free_intremap_table(u16 seg, struct ivrs_mappings *);
+int amd_iommu_free_intremap_table(
+ const struct amd_iommu *, struct ivrs_mappings *);
void amd_iommu_ioapic_update_ire(
unsigned int apic, unsigned int reg, unsigned int value);
unsigned int amd_iommu_read_ioapic_from_ire(
++++++ 5d358a92-AMD-IOMMU-pass-IOMMU-to-amd_iommu_alloc_intremap_table.patch ++++++
References: bsc#1135799
# Commit 01dc6aa94246a2e4519726552b06ac1fe4e6d31a
# Date 2019-07-22 12:06:10 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: pass IOMMU to amd_iommu_alloc_intremap_table()
The function will want to know IOMMU properties (specifically the IRTE
size) subsequently.
Correct indentation of one of the call sites at this occasion.
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -74,12 +74,14 @@ static void __init add_ivrs_mapping_entr
/* allocate per-device interrupt remapping table */
if ( amd_iommu_perdev_intremap )
ivrs_mappings[alias_id].intremap_table =
- amd_iommu_alloc_intremap_table(
- &ivrs_mappings[alias_id].intremap_inuse);
+ amd_iommu_alloc_intremap_table(
+ iommu,
+ &ivrs_mappings[alias_id].intremap_inuse);
else
{
if ( shared_intremap_table == NULL )
shared_intremap_table = amd_iommu_alloc_intremap_table(
+ iommu,
&shared_intremap_inuse);
ivrs_mappings[alias_id].intremap_table = shared_intremap_table;
ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse;
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -622,7 +622,8 @@ int __init amd_iommu_free_intremap_table
return 0;
}
-void* __init amd_iommu_alloc_intremap_table(unsigned long **inuse_map)
+void *__init amd_iommu_alloc_intremap_table(
+ const struct amd_iommu *iommu, unsigned long **inuse_map)
{
void *tb;
tb = __alloc_amd_iommu_tables(INTREMAP_TABLE_ORDER);
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -96,7 +96,8 @@ struct amd_iommu *find_iommu_for_device(
/* interrupt remapping */
int amd_iommu_setup_ioapic_remapping(void);
-void *amd_iommu_alloc_intremap_table(unsigned long **);
+void *amd_iommu_alloc_intremap_table(
+ const struct amd_iommu *, unsigned long **);
int amd_iommu_free_intremap_table(
const struct amd_iommu *, struct ivrs_mappings *);
void amd_iommu_ioapic_update_ire(
++++++ 5d39811c-x86-IOMMU-dont-restrict-IRQ-affinities.patch ++++++
References: bsc#1135799
# Commit 05f41944a05cc89652b6ceb7a08ecb22468d9188
# Date 2019-07-25 12:14:52 +0200
# Author Jan Beulich
# Committer Jan Beulich
x86/IOMMU: don't restrict IRQ affinities to online CPUs
In line with "x86/IRQ: desc->affinity should strictly represent the
requested value" the internally used IRQ(s) also shouldn't be restricted
to online ones. Make set_desc_affinity() (set_msi_affinity() then does
by implication) cope with a NULL mask being passed (just like
assign_irq_vector() does), and have IOMMU code pass NULL instead of
&cpu_online_map (when, for VT-d, there's no NUMA node information
available).
Signed-off-by: Jan Beulich
Reviewed-by: Roger Pau Monné
Acked-by: Andrew Cooper
Reviewed-by: Kevin Tian
Acked-by: Brian Woods
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -750,20 +750,28 @@ unsigned int set_desc_affinity(struct ir
unsigned long flags;
cpumask_t dest_mask;
- if (!cpumask_intersects(mask, &cpu_online_map))
+ if ( mask && !cpumask_intersects(mask, &cpu_online_map) )
return BAD_APICID;
irq = desc->irq;
spin_lock_irqsave(&vector_lock, flags);
- ret = __assign_irq_vector(irq, desc, mask);
+ ret = __assign_irq_vector(irq, desc, mask ?: TARGET_CPUS);
spin_unlock_irqrestore(&vector_lock, flags);
- if (ret < 0)
+ if ( ret < 0 )
return BAD_APICID;
- cpumask_copy(desc->affinity, mask);
- cpumask_and(&dest_mask, mask, desc->arch.cpu_mask);
+ if ( mask )
+ {
+ cpumask_copy(desc->affinity, mask);
+ cpumask_and(&dest_mask, mask, desc->arch.cpu_mask);
+ }
+ else
+ {
+ cpumask_setall(desc->affinity);
+ cpumask_copy(&dest_mask, desc->arch.cpu_mask);
+ }
cpumask_and(&dest_mask, &dest_mask, &cpu_online_map);
return cpu_mask_to_apicid(&dest_mask);
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -888,7 +888,7 @@ static void enable_iommu(struct amd_iomm
desc = irq_to_desc(iommu->msi.irq);
spin_lock(&desc->lock);
- set_msi_affinity(desc, &cpu_online_map);
+ set_msi_affinity(desc, NULL);
spin_unlock(&desc->lock);
amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED);
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2133,10 +2133,10 @@ static void adjust_irq_affinity(struct a
const struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd);
unsigned int node = rhsa ? pxm_to_node(rhsa->proximity_domain)
: NUMA_NO_NODE;
- const cpumask_t *cpumask = &cpu_online_map;
+ const cpumask_t *cpumask = NULL;
if ( node < MAX_NUMNODES && node_online(node) &&
- cpumask_intersects(&node_to_cpumask(node), cpumask) )
+ cpumask_intersects(&node_to_cpumask(node), &cpu_online_map) )
cpumask = &node_to_cpumask(node);
dma_msi_set_affinity(irq_to_desc(drhd->iommu->msi.irq), cpumask);
}
++++++ 5d417813-AMD-IOMMU-bitfield-extended-features.patch ++++++
References: bsc#1135799
# Commit c69363b2ac7e5ed88908a304e6903f5842c9805e
# Date 2019-07-31 13:14:27 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: use bit field for extended feature register
This also takes care of several of the shift values wrongly having been
specified as hex rather than dec.
Take the opportunity and
- replace a readl() pair by a single readq(),
- add further fields.
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/iommu_detect.c
+++ b/xen/drivers/passthrough/amd/iommu_detect.c
@@ -60,49 +60,76 @@ static int __init get_iommu_capabilities
void __init get_iommu_features(struct amd_iommu *iommu)
{
- u32 low, high;
- int i = 0 ;
const struct amd_iommu *first;
- static const char *__initdata feature_str[] = {
- "- Prefetch Pages Command",
- "- Peripheral Page Service Request",
- "- X2APIC Supported",
- "- NX bit Supported",
- "- Guest Translation",
- "- Reserved bit [5]",
- "- Invalidate All Command",
- "- Guest APIC supported",
- "- Hardware Error Registers",
- "- Performance Counters",
- NULL
- };
-
ASSERT( iommu->mmio_base );
if ( !iommu_has_cap(iommu, PCI_CAP_EFRSUP_SHIFT) )
{
- iommu->features = 0;
+ iommu->features.raw = 0;
return;
}
- low = readl(iommu->mmio_base + IOMMU_EXT_FEATURE_MMIO_OFFSET);
- high = readl(iommu->mmio_base + IOMMU_EXT_FEATURE_MMIO_OFFSET + 4);
-
- iommu->features = ((u64)high << 32) | low;
+ iommu->features.raw =
+ readq(iommu->mmio_base + IOMMU_EXT_FEATURE_MMIO_OFFSET);
/* Don't log the same set of features over and over. */
first = list_first_entry(&amd_iommu_head, struct amd_iommu, list);
- if ( iommu != first && iommu->features == first->features )
+ if ( iommu != first && iommu->features.raw == first->features.raw )
return;
printk("AMD-Vi: IOMMU Extended Features:\n");
- while ( feature_str[i] )
+#define FEAT(fld, str) do { \
+ if ( --((union amd_iommu_ext_features){}).flds.fld > 1 ) \
+ printk( "- " str ": %#x\n", iommu->features.flds.fld); \
+ else if ( iommu->features.flds.fld ) \
+ printk( "- " str "\n"); \
+} while ( false )
+
+ FEAT(pref_sup, "Prefetch Pages Command");
+ FEAT(ppr_sup, "Peripheral Page Service Request");
+ FEAT(xt_sup, "x2APIC");
+ FEAT(nx_sup, "NX bit");
+ FEAT(gappi_sup, "Guest APIC Physical Processor Interrupt");
+ FEAT(ia_sup, "Invalidate All Command");
+ FEAT(ga_sup, "Guest APIC");
+ FEAT(he_sup, "Hardware Error Registers");
+ FEAT(pc_sup, "Performance Counters");
+ FEAT(hats, "Host Address Translation Size");
+
+ if ( iommu->features.flds.gt_sup )
{
- if ( amd_iommu_has_feature(iommu, i) )
- printk( " %s\n", feature_str[i]);
- i++;
+ FEAT(gats, "Guest Address Translation Size");
+ FEAT(glx_sup, "Guest CR3 Root Table Level");
+ FEAT(pas_max, "Maximum PASID");
}
+
+ FEAT(smif_sup, "SMI Filter Register");
+ FEAT(smif_rc, "SMI Filter Register Count");
+ FEAT(gam_sup, "Guest Virtual APIC Modes");
+ FEAT(dual_ppr_log_sup, "Dual PPR Log");
+ FEAT(dual_event_log_sup, "Dual Event Log");
+ FEAT(sats_sup, "Secure ATS");
+ FEAT(us_sup, "User / Supervisor Page Protection");
+ FEAT(dev_tbl_seg_sup, "Device Table Segmentation");
+ FEAT(ppr_early_of_sup, "PPR Log Overflow Early Warning");
+ FEAT(ppr_auto_rsp_sup, "PPR Automatic Response");
+ FEAT(marc_sup, "Memory Access Routing and Control");
+ FEAT(blk_stop_mrk_sup, "Block StopMark Message");
+ FEAT(perf_opt_sup , "Performance Optimization");
+ FEAT(msi_cap_mmio_sup, "MSI Capability MMIO Access");
+ FEAT(gio_sup, "Guest I/O Protection");
+ FEAT(ha_sup, "Host Access");
+ FEAT(eph_sup, "Enhanced PPR Handling");
+ FEAT(attr_fw_sup, "Attribute Forward");
+ FEAT(hd_sup, "Host Dirty");
+ FEAT(inv_iotlb_type_sup, "Invalidate IOTLB Type");
+ FEAT(viommu_sup, "Virtualized IOMMU");
+ FEAT(vm_guard_io_sup, "VMGuard I/O Support");
+ FEAT(vm_table_size, "VM Table Size");
+ FEAT(ga_update_dis_sup, "Guest Access Bit Update Disable");
+
+#undef FEAT
}
int __init amd_iommu_detect_one_acpi(
--- a/xen/drivers/passthrough/amd/iommu_guest.c
+++ b/xen/drivers/passthrough/amd/iommu_guest.c
@@ -667,7 +667,7 @@ static uint64_t iommu_mmio_read64(struct
val = reg_to_u64(iommu->reg_status);
break;
case IOMMU_EXT_FEATURE_MMIO_OFFSET:
- val = reg_to_u64(iommu->reg_ext_feature);
+ val = iommu->reg_ext_feature.raw;
break;
default:
@@ -831,39 +831,26 @@ int guest_iommu_set_base(struct domain *
/* Initialize mmio read only bits */
static void guest_iommu_reg_init(struct guest_iommu *iommu)
{
- uint32_t lower, upper;
+ union amd_iommu_ext_features ef = {
+ /* Support prefetch */
+ .flds.pref_sup = 1,
+ /* Support PPR log */
+ .flds.ppr_sup = 1,
+ /* Support guest translation */
+ .flds.gt_sup = 1,
+ /* Support invalidate all command */
+ .flds.ia_sup = 1,
+ /* Host translation size has 6 levels */
+ .flds.hats = HOST_ADDRESS_SIZE_6_LEVEL,
+ /* Guest translation size has 6 levels */
+ .flds.gats = GUEST_ADDRESS_SIZE_6_LEVEL,
+ /* Single level gCR3 */
+ .flds.glx_sup = GUEST_CR3_1_LEVEL,
+ /* 9 bit PASID */
+ .flds.pas_max = PASMAX_9_bit,
+ };
- lower = upper = 0;
- /* Support prefetch */
- iommu_set_bit(&lower,IOMMU_EXT_FEATURE_PREFSUP_SHIFT);
- /* Support PPR log */
- iommu_set_bit(&lower,IOMMU_EXT_FEATURE_PPRSUP_SHIFT);
- /* Support guest translation */
- iommu_set_bit(&lower,IOMMU_EXT_FEATURE_GTSUP_SHIFT);
- /* Support invalidate all command */
- iommu_set_bit(&lower,IOMMU_EXT_FEATURE_IASUP_SHIFT);
-
- /* Host translation size has 6 levels */
- set_field_in_reg_u32(HOST_ADDRESS_SIZE_6_LEVEL, lower,
- IOMMU_EXT_FEATURE_HATS_MASK,
- IOMMU_EXT_FEATURE_HATS_SHIFT,
- &lower);
- /* Guest translation size has 6 levels */
- set_field_in_reg_u32(GUEST_ADDRESS_SIZE_6_LEVEL, lower,
- IOMMU_EXT_FEATURE_GATS_MASK,
- IOMMU_EXT_FEATURE_GATS_SHIFT,
- &lower);
- /* Single level gCR3 */
- set_field_in_reg_u32(GUEST_CR3_1_LEVEL, lower,
- IOMMU_EXT_FEATURE_GLXSUP_MASK,
- IOMMU_EXT_FEATURE_GLXSUP_SHIFT, &lower);
- /* 9 bit PASID */
- set_field_in_reg_u32(PASMAX_9_bit, upper,
- IOMMU_EXT_FEATURE_PASMAX_MASK,
- IOMMU_EXT_FEATURE_PASMAX_SHIFT, &upper);
-
- iommu->reg_ext_feature.lo = lower;
- iommu->reg_ext_feature.hi = upper;
+ iommu->reg_ext_feature = ef;
}
static int guest_iommu_mmio_range(struct vcpu *v, unsigned long addr)
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -883,7 +883,7 @@ static void enable_iommu(struct amd_iomm
register_iommu_event_log_in_mmio_space(iommu);
register_iommu_exclusion_range(iommu);
- if ( amd_iommu_has_feature(iommu, IOMMU_EXT_FEATURE_PPRSUP_SHIFT) )
+ if ( iommu->features.flds.ppr_sup )
register_iommu_ppr_log_in_mmio_space(iommu);
desc = irq_to_desc(iommu->msi.irq);
@@ -897,15 +897,15 @@ static void enable_iommu(struct amd_iomm
set_iommu_command_buffer_control(iommu, IOMMU_CONTROL_ENABLED);
set_iommu_event_log_control(iommu, IOMMU_CONTROL_ENABLED);
- if ( amd_iommu_has_feature(iommu, IOMMU_EXT_FEATURE_PPRSUP_SHIFT) )
+ if ( iommu->features.flds.ppr_sup )
set_iommu_ppr_log_control(iommu, IOMMU_CONTROL_ENABLED);
- if ( amd_iommu_has_feature(iommu, IOMMU_EXT_FEATURE_GTSUP_SHIFT) )
+ if ( iommu->features.flds.gt_sup )
set_iommu_guest_translation_control(iommu, IOMMU_CONTROL_ENABLED);
set_iommu_translation_control(iommu, IOMMU_CONTROL_ENABLED);
- if ( amd_iommu_has_feature(iommu, IOMMU_EXT_FEATURE_IASUP_SHIFT) )
+ if ( iommu->features.flds.ia_sup )
amd_iommu_flush_all_caches(iommu);
iommu->enabled = 1;
@@ -928,10 +928,10 @@ static void disable_iommu(struct amd_iom
set_iommu_command_buffer_control(iommu, IOMMU_CONTROL_DISABLED);
set_iommu_event_log_control(iommu, IOMMU_CONTROL_DISABLED);
- if ( amd_iommu_has_feature(iommu, IOMMU_EXT_FEATURE_PPRSUP_SHIFT) )
+ if ( iommu->features.flds.ppr_sup )
set_iommu_ppr_log_control(iommu, IOMMU_CONTROL_DISABLED);
- if ( amd_iommu_has_feature(iommu, IOMMU_EXT_FEATURE_GTSUP_SHIFT) )
+ if ( iommu->features.flds.gt_sup )
set_iommu_guest_translation_control(iommu, IOMMU_CONTROL_DISABLED);
set_iommu_translation_control(iommu, IOMMU_CONTROL_DISABLED);
@@ -1029,7 +1029,7 @@ static int __init amd_iommu_init_one(str
get_iommu_features(iommu);
- if ( iommu->features )
+ if ( iommu->features.raw )
iommuv2_enabled = 1;
if ( allocate_cmd_buffer(iommu) == NULL )
@@ -1038,9 +1038,8 @@ static int __init amd_iommu_init_one(str
if ( allocate_event_log(iommu) == NULL )
goto error_out;
- if ( amd_iommu_has_feature(iommu, IOMMU_EXT_FEATURE_PPRSUP_SHIFT) )
- if ( allocate_ppr_log(iommu) == NULL )
- goto error_out;
+ if ( iommu->features.flds.ppr_sup && !allocate_ppr_log(iommu) )
+ goto error_out;
if ( !set_iommu_interrupt_handler(iommu) )
goto error_out;
@@ -1396,7 +1395,7 @@ void amd_iommu_resume(void)
}
/* flush all cache entries after iommu re-enabled */
- if ( !amd_iommu_has_feature(iommu, IOMMU_EXT_FEATURE_IASUP_SHIFT) )
+ if ( !iommu->features.flds.ia_sup )
{
invalidate_all_devices();
invalidate_all_domain_pages();
--- a/xen/include/asm-x86/amd-iommu.h
+++ b/xen/include/asm-x86/amd-iommu.h
@@ -88,7 +88,7 @@ struct amd_iommu {
iommu_cap_t cap;
u8 ht_flags;
- u64 features;
+ union amd_iommu_ext_features features;
void *mmio_base;
unsigned long mmio_base_phys;
@@ -180,7 +180,7 @@ struct guest_iommu {
/* MMIO regs */
struct mmio_reg reg_ctrl; /* MMIO offset 0018h */
struct mmio_reg reg_status; /* MMIO offset 2020h */
- struct mmio_reg reg_ext_feature; /* MMIO offset 0030h */
+ union amd_iommu_ext_features reg_ext_feature; /* MMIO offset 0030h */
/* guest interrupt settings */
struct guest_iommu_msi msi;
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
@@ -362,26 +362,57 @@
#define IOMMU_EXCLUSION_LIMIT_HIGH_MASK 0xFFFFFFFF
#define IOMMU_EXCLUSION_LIMIT_HIGH_SHIFT 0
-/* Extended Feature Register*/
+/* Extended Feature Register */
#define IOMMU_EXT_FEATURE_MMIO_OFFSET 0x30
-#define IOMMU_EXT_FEATURE_PREFSUP_SHIFT 0x0
-#define IOMMU_EXT_FEATURE_PPRSUP_SHIFT 0x1
-#define IOMMU_EXT_FEATURE_XTSUP_SHIFT 0x2
-#define IOMMU_EXT_FEATURE_NXSUP_SHIFT 0x3
-#define IOMMU_EXT_FEATURE_GTSUP_SHIFT 0x4
-#define IOMMU_EXT_FEATURE_IASUP_SHIFT 0x6
-#define IOMMU_EXT_FEATURE_GASUP_SHIFT 0x7
-#define IOMMU_EXT_FEATURE_HESUP_SHIFT 0x8
-#define IOMMU_EXT_FEATURE_PCSUP_SHIFT 0x9
-#define IOMMU_EXT_FEATURE_HATS_SHIFT 0x10
-#define IOMMU_EXT_FEATURE_HATS_MASK 0x00000C00
-#define IOMMU_EXT_FEATURE_GATS_SHIFT 0x12
-#define IOMMU_EXT_FEATURE_GATS_MASK 0x00003000
-#define IOMMU_EXT_FEATURE_GLXSUP_SHIFT 0x14
-#define IOMMU_EXT_FEATURE_GLXSUP_MASK 0x0000C000
-#define IOMMU_EXT_FEATURE_PASMAX_SHIFT 0x0
-#define IOMMU_EXT_FEATURE_PASMAX_MASK 0x0000001F
+union amd_iommu_ext_features {
+ uint64_t raw;
+ struct {
+ unsigned int pref_sup:1;
+ unsigned int ppr_sup:1;
+ unsigned int xt_sup:1;
+ unsigned int nx_sup:1;
+ unsigned int gt_sup:1;
+ unsigned int gappi_sup:1;
+ unsigned int ia_sup:1;
+ unsigned int ga_sup:1;
+ unsigned int he_sup:1;
+ unsigned int pc_sup:1;
+ unsigned int hats:2;
+ unsigned int gats:2;
+ unsigned int glx_sup:2;
+ unsigned int smif_sup:2;
+ unsigned int smif_rc:3;
+ unsigned int gam_sup:3;
+ unsigned int dual_ppr_log_sup:2;
+ unsigned int :2;
+ unsigned int dual_event_log_sup:2;
+ unsigned int :1;
+ unsigned int sats_sup:1;
+ unsigned int pas_max:5;
+ unsigned int us_sup:1;
+ unsigned int dev_tbl_seg_sup:2;
+ unsigned int ppr_early_of_sup:1;
+ unsigned int ppr_auto_rsp_sup:1;
+ unsigned int marc_sup:2;
+ unsigned int blk_stop_mrk_sup:1;
+ unsigned int perf_opt_sup:1;
+ unsigned int msi_cap_mmio_sup:1;
+ unsigned int :1;
+ unsigned int gio_sup:1;
+ unsigned int ha_sup:1;
+ unsigned int eph_sup:1;
+ unsigned int attr_fw_sup:1;
+ unsigned int hd_sup:1;
+ unsigned int :1;
+ unsigned int inv_iotlb_type_sup:1;
+ unsigned int viommu_sup:1;
+ unsigned int vm_guard_io_sup:1;
+ unsigned int vm_table_size:4;
+ unsigned int ga_update_dis_sup:1;
+ unsigned int :2;
+ } flds;
+};
/* Status Register*/
#define IOMMU_STATUS_MMIO_OFFSET 0x2020
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -220,13 +220,6 @@ static inline int iommu_has_cap(struct a
return !!(iommu->cap.header & (1u << bit));
}
-static inline int amd_iommu_has_feature(struct amd_iommu *iommu, uint32_t bit)
-{
- if ( !iommu_has_cap(iommu, PCI_CAP_EFRSUP_SHIFT) )
- return 0;
- return !!(iommu->features & (1U << bit));
-}
-
/* access tail or head pointer of ring buffer */
static inline uint32_t iommu_get_rb_pointer(uint32_t reg)
{
++++++ 5d417838-AMD-IOMMU-bitfield-control-reg.patch ++++++
References: bsc#1135799
# Commit 08344ec71cad07829855fb7927faaafd26189798
# Date 2019-07-31 13:15:04 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: use bit field for control register
Also introduce a field in struct amd_iommu caching the most recently
written control register. All writes should now happen exclusively from
that cached value, such that it is guaranteed to be up to date.
Take the opportunity and add further fields. Also convert a few boolean
function parameters to bool, such that use of !! can be avoided.
Because of there now being definitions beyond bit 31, writel() also gets
replaced by writeq() when updating hardware.
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/iommu_guest.c
+++ b/xen/drivers/passthrough/amd/iommu_guest.c
@@ -346,7 +346,7 @@ static int do_invalidate_iotlb_pages(str
static int do_completion_wait(struct domain *d, cmd_entry_t *cmd)
{
- bool_t com_wait_int_en, com_wait_int, i, s;
+ bool com_wait_int, i, s;
struct guest_iommu *iommu;
unsigned long gfn;
p2m_type_t p2mt;
@@ -383,12 +383,10 @@ static int do_completion_wait(struct dom
unmap_domain_page(vaddr);
}
- com_wait_int_en = iommu_get_bit(iommu->reg_ctrl.lo,
- IOMMU_CONTROL_COMP_WAIT_INT_SHIFT);
com_wait_int = iommu_get_bit(iommu->reg_status.lo,
IOMMU_STATUS_COMP_WAIT_INT_SHIFT);
- if ( com_wait_int_en && com_wait_int )
+ if ( iommu->reg_ctrl.com_wait_int_en && com_wait_int )
guest_iommu_deliver_msi(d);
return 0;
@@ -550,40 +548,17 @@ static void guest_iommu_process_command(
return;
}
-static int guest_iommu_write_ctrl(struct guest_iommu *iommu, uint64_t newctrl)
+static int guest_iommu_write_ctrl(struct guest_iommu *iommu, uint64_t val)
{
- bool_t cmd_en, event_en, iommu_en, ppr_en, ppr_log_en;
- bool_t cmd_en_old, event_en_old, iommu_en_old;
- bool_t cmd_run;
-
- iommu_en = iommu_get_bit(newctrl,
- IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT);
- iommu_en_old = iommu_get_bit(iommu->reg_ctrl.lo,
- IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT);
-
- cmd_en = iommu_get_bit(newctrl,
- IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT);
- cmd_en_old = iommu_get_bit(iommu->reg_ctrl.lo,
- IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT);
- cmd_run = iommu_get_bit(iommu->reg_status.lo,
- IOMMU_STATUS_CMD_BUFFER_RUN_SHIFT);
- event_en = iommu_get_bit(newctrl,
- IOMMU_CONTROL_EVENT_LOG_ENABLE_SHIFT);
- event_en_old = iommu_get_bit(iommu->reg_ctrl.lo,
- IOMMU_CONTROL_EVENT_LOG_ENABLE_SHIFT);
-
- ppr_en = iommu_get_bit(newctrl,
- IOMMU_CONTROL_PPR_ENABLE_SHIFT);
- ppr_log_en = iommu_get_bit(newctrl,
- IOMMU_CONTROL_PPR_LOG_ENABLE_SHIFT);
+ union amd_iommu_control newctrl = { .raw = val };
- if ( iommu_en )
+ if ( newctrl.iommu_en )
{
guest_iommu_enable(iommu);
guest_iommu_enable_dev_table(iommu);
}
- if ( iommu_en && cmd_en )
+ if ( newctrl.iommu_en && newctrl.cmd_buf_en )
{
guest_iommu_enable_ring_buffer(iommu, &iommu->cmd_buffer,
sizeof(cmd_entry_t));
@@ -591,7 +566,7 @@ static int guest_iommu_write_ctrl(struct
tasklet_schedule(&iommu->cmd_buffer_tasklet);
}
- if ( iommu_en && event_en )
+ if ( newctrl.iommu_en && newctrl.event_log_en )
{
guest_iommu_enable_ring_buffer(iommu, &iommu->event_log,
sizeof(event_entry_t));
@@ -599,7 +574,7 @@ static int guest_iommu_write_ctrl(struct
guest_iommu_clear_status(iommu, IOMMU_STATUS_EVENT_OVERFLOW_SHIFT);
}
- if ( iommu_en && ppr_en && ppr_log_en )
+ if ( newctrl.iommu_en && newctrl.ppr_en && newctrl.ppr_log_en )
{
guest_iommu_enable_ring_buffer(iommu, &iommu->ppr_log,
sizeof(ppr_entry_t));
@@ -607,19 +582,21 @@ static int guest_iommu_write_ctrl(struct
guest_iommu_clear_status(iommu, IOMMU_STATUS_PPR_LOG_OVERFLOW_SHIFT);
}
- if ( iommu_en && cmd_en_old && !cmd_en )
+ if ( newctrl.iommu_en && iommu->reg_ctrl.cmd_buf_en &&
+ !newctrl.cmd_buf_en )
{
/* Disable iommu command processing */
tasklet_kill(&iommu->cmd_buffer_tasklet);
}
- if ( event_en_old && !event_en )
+ if ( iommu->reg_ctrl.event_log_en && !newctrl.event_log_en )
guest_iommu_clear_status(iommu, IOMMU_STATUS_EVENT_LOG_RUN_SHIFT);
- if ( iommu_en_old && !iommu_en )
+ if ( iommu->reg_ctrl.iommu_en && !newctrl.iommu_en )
guest_iommu_disable(iommu);
- u64_to_reg(&iommu->reg_ctrl, newctrl);
+ iommu->reg_ctrl = newctrl;
+
return 0;
}
@@ -661,7 +638,7 @@ static uint64_t iommu_mmio_read64(struct
val = reg_to_u64(iommu->ppr_log.reg_tail);
break;
case IOMMU_CONTROL_MMIO_OFFSET:
- val = reg_to_u64(iommu->reg_ctrl);
+ val = iommu->reg_ctrl.raw;
break;
case IOMMU_STATUS_MMIO_OFFSET:
val = reg_to_u64(iommu->reg_status);
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -41,7 +41,7 @@ struct list_head amd_iommu_head;
struct table_struct device_table;
bool_t iommuv2_enabled;
-static int iommu_has_ht_flag(struct amd_iommu *iommu, u8 mask)
+static bool iommu_has_ht_flag(struct amd_iommu *iommu, u8 mask)
{
return iommu->ht_flags & mask;
}
@@ -69,31 +69,18 @@ static void __init unmap_iommu_mmio_regi
static void set_iommu_ht_flags(struct amd_iommu *iommu)
{
- u32 entry;
- entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
-
/* Setup HT flags */
if ( iommu_has_cap(iommu, PCI_CAP_HT_TUNNEL_SHIFT) )
- iommu_has_ht_flag(iommu, ACPI_IVHD_TT_ENABLE) ?
- iommu_set_bit(&entry, IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_SHIFT) :
- iommu_clear_bit(&entry, IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_SHIFT);
-
- iommu_has_ht_flag(iommu, ACPI_IVHD_RES_PASS_PW) ?
- iommu_set_bit(&entry, IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_SHIFT):
- iommu_clear_bit(&entry, IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_SHIFT);
-
- iommu_has_ht_flag(iommu, ACPI_IVHD_ISOC) ?
- iommu_set_bit(&entry, IOMMU_CONTROL_ISOCHRONOUS_SHIFT):
- iommu_clear_bit(&entry, IOMMU_CONTROL_ISOCHRONOUS_SHIFT);
-
- iommu_has_ht_flag(iommu, ACPI_IVHD_PASS_PW) ?
- iommu_set_bit(&entry, IOMMU_CONTROL_PASS_POSTED_WRITE_SHIFT):
- iommu_clear_bit(&entry, IOMMU_CONTROL_PASS_POSTED_WRITE_SHIFT);
+ iommu->ctrl.ht_tun_en = iommu_has_ht_flag(iommu, ACPI_IVHD_TT_ENABLE);
+
+ iommu->ctrl.pass_pw = iommu_has_ht_flag(iommu, ACPI_IVHD_PASS_PW);
+ iommu->ctrl.res_pass_pw = iommu_has_ht_flag(iommu, ACPI_IVHD_RES_PASS_PW);
+ iommu->ctrl.isoc = iommu_has_ht_flag(iommu, ACPI_IVHD_ISOC);
/* Force coherent */
- iommu_set_bit(&entry, IOMMU_CONTROL_COHERENT_SHIFT);
+ iommu->ctrl.coherent = true;
- writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
+ writeq(iommu->ctrl.raw, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
}
static void register_iommu_dev_table_in_mmio_space(struct amd_iommu *iommu)
@@ -205,55 +192,37 @@ static void register_iommu_ppr_log_in_mm
static void set_iommu_translation_control(struct amd_iommu *iommu,
- int enable)
+ bool enable)
{
- u32 entry;
+ iommu->ctrl.iommu_en = enable;
- entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
-
- enable ?
- iommu_set_bit(&entry, IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT) :
- iommu_clear_bit(&entry, IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT);
-
- writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
+ writeq(iommu->ctrl.raw, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
}
static void set_iommu_guest_translation_control(struct amd_iommu *iommu,
- int enable)
+ bool enable)
{
- u32 entry;
-
- entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
+ iommu->ctrl.gt_en = enable;
- enable ?
- iommu_set_bit(&entry, IOMMU_CONTROL_GT_ENABLE_SHIFT) :
- iommu_clear_bit(&entry, IOMMU_CONTROL_GT_ENABLE_SHIFT);
-
- writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
+ writeq(iommu->ctrl.raw, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
if ( enable )
AMD_IOMMU_DEBUG("Guest Translation Enabled.\n");
}
static void set_iommu_command_buffer_control(struct amd_iommu *iommu,
- int enable)
+ bool enable)
{
- u32 entry;
-
- entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
-
- /*reset head and tail pointer manually before enablement */
+ /* Reset head and tail pointer manually before enablement */
if ( enable )
{
writeq(0, iommu->mmio_base + IOMMU_CMD_BUFFER_HEAD_OFFSET);
writeq(0, iommu->mmio_base + IOMMU_CMD_BUFFER_TAIL_OFFSET);
-
- iommu_set_bit(&entry, IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT);
}
- else
- iommu_clear_bit(&entry, IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT);
- writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
+ iommu->ctrl.cmd_buf_en = enable;
+
+ writeq(iommu->ctrl.raw, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
}
static void register_iommu_exclusion_range(struct amd_iommu *iommu)
@@ -295,57 +264,38 @@ static void register_iommu_exclusion_ran
}
static void set_iommu_event_log_control(struct amd_iommu *iommu,
- int enable)
+ bool enable)
{
- u32 entry;
-
- entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
-
- /*reset head and tail pointer manually before enablement */
+ /* Reset head and tail pointer manually before enablement */
if ( enable )
{
writeq(0, iommu->mmio_base + IOMMU_EVENT_LOG_HEAD_OFFSET);
writeq(0, iommu->mmio_base + IOMMU_EVENT_LOG_TAIL_OFFSET);
-
- iommu_set_bit(&entry, IOMMU_CONTROL_EVENT_LOG_INT_SHIFT);
- iommu_set_bit(&entry, IOMMU_CONTROL_EVENT_LOG_ENABLE_SHIFT);
- }
- else
- {
- iommu_clear_bit(&entry, IOMMU_CONTROL_EVENT_LOG_INT_SHIFT);
- iommu_clear_bit(&entry, IOMMU_CONTROL_EVENT_LOG_ENABLE_SHIFT);
}
- iommu_clear_bit(&entry, IOMMU_CONTROL_COMP_WAIT_INT_SHIFT);
+ iommu->ctrl.event_int_en = enable;
+ iommu->ctrl.event_log_en = enable;
+ iommu->ctrl.com_wait_int_en = false;
- writel(entry, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
+ writeq(iommu->ctrl.raw, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
}
static void set_iommu_ppr_log_control(struct amd_iommu *iommu,
- int enable)
+ bool enable)
{
- u32 entry;
-
- entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
-
- /*reset head and tail pointer manually before enablement */
+ /* Reset head and tail pointer manually before enablement */
if ( enable )
{
writeq(0, iommu->mmio_base + IOMMU_PPR_LOG_HEAD_OFFSET);
writeq(0, iommu->mmio_base + IOMMU_PPR_LOG_TAIL_OFFSET);
-
- iommu_set_bit(&entry, IOMMU_CONTROL_PPR_ENABLE_SHIFT);
- iommu_set_bit(&entry, IOMMU_CONTROL_PPR_LOG_INT_SHIFT);
- iommu_set_bit(&entry, IOMMU_CONTROL_PPR_LOG_ENABLE_SHIFT);
- }
- else
- {
- iommu_clear_bit(&entry, IOMMU_CONTROL_PPR_ENABLE_SHIFT);
- iommu_clear_bit(&entry, IOMMU_CONTROL_PPR_LOG_INT_SHIFT);
- iommu_clear_bit(&entry, IOMMU_CONTROL_PPR_LOG_ENABLE_SHIFT);
}
- writel(entry, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
+ iommu->ctrl.ppr_en = enable;
+ iommu->ctrl.ppr_int_en = enable;
+ iommu->ctrl.ppr_log_en = enable;
+
+ writeq(iommu->ctrl.raw, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
+
if ( enable )
AMD_IOMMU_DEBUG("PPR Log Enabled.\n");
}
@@ -398,7 +348,7 @@ static int iommu_read_log(struct amd_iom
/* reset event log or ppr log when overflow */
static void iommu_reset_log(struct amd_iommu *iommu,
struct ring_buffer *log,
- void (*ctrl_func)(struct amd_iommu *iommu, int))
+ void (*ctrl_func)(struct amd_iommu *iommu, bool))
{
u32 entry;
int log_run, run_bit;
@@ -615,11 +565,11 @@ static void iommu_check_event_log(struct
iommu_reset_log(iommu, &iommu->event_log, set_iommu_event_log_control);
else
{
- entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
- if ( !(entry & IOMMU_CONTROL_EVENT_LOG_INT_MASK) )
+ if ( !iommu->ctrl.event_int_en )
{
- entry |= IOMMU_CONTROL_EVENT_LOG_INT_MASK;
- writel(entry, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
+ iommu->ctrl.event_int_en = true;
+ writeq(iommu->ctrl.raw,
+ iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
/*
* Re-schedule the tasklet to handle eventual log entries added
* between reading the log above and re-enabling the interrupt.
@@ -704,11 +654,11 @@ static void iommu_check_ppr_log(struct a
iommu_reset_log(iommu, &iommu->ppr_log, set_iommu_ppr_log_control);
else
{
- entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
- if ( !(entry & IOMMU_CONTROL_PPR_LOG_INT_MASK) )
+ if ( !iommu->ctrl.ppr_int_en )
{
- entry |= IOMMU_CONTROL_PPR_LOG_INT_MASK;
- writel(entry, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
+ iommu->ctrl.ppr_int_en = true;
+ writeq(iommu->ctrl.raw,
+ iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
/*
* Re-schedule the tasklet to handle eventual log entries added
* between reading the log above and re-enabling the interrupt.
@@ -754,7 +704,6 @@ static void do_amd_iommu_irq(unsigned lo
static void iommu_interrupt_handler(int irq, void *dev_id,
struct cpu_user_regs *regs)
{
- u32 entry;
unsigned long flags;
struct amd_iommu *iommu = dev_id;
@@ -764,10 +713,9 @@ static void iommu_interrupt_handler(int
* Silence interrupts from both event and PPR by clearing the
* enable logging bits in the control register
*/
- entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
- iommu_clear_bit(&entry, IOMMU_CONTROL_EVENT_LOG_INT_SHIFT);
- iommu_clear_bit(&entry, IOMMU_CONTROL_PPR_LOG_INT_SHIFT);
- writel(entry, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
+ iommu->ctrl.event_int_en = false;
+ iommu->ctrl.ppr_int_en = false;
+ writeq(iommu->ctrl.raw, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
spin_unlock_irqrestore(&iommu->lock, flags);
--- a/xen/include/asm-x86/amd-iommu.h
+++ b/xen/include/asm-x86/amd-iommu.h
@@ -93,6 +93,8 @@ struct amd_iommu {
void *mmio_base;
unsigned long mmio_base_phys;
+ union amd_iommu_control ctrl;
+
struct table_struct dev_table;
struct ring_buffer cmd_buffer;
struct ring_buffer event_log;
@@ -178,7 +180,7 @@ struct guest_iommu {
uint64_t mmio_base; /* MMIO base address */
/* MMIO regs */
- struct mmio_reg reg_ctrl; /* MMIO offset 0018h */
+ union amd_iommu_control reg_ctrl; /* MMIO offset 0018h */
struct mmio_reg reg_status; /* MMIO offset 2020h */
union amd_iommu_ext_features reg_ext_feature; /* MMIO offset 0030h */
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
@@ -311,38 +311,56 @@
/* Control Register */
#define IOMMU_CONTROL_MMIO_OFFSET 0x18
-#define IOMMU_CONTROL_TRANSLATION_ENABLE_MASK 0x00000001
-#define IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT 0
-#define IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_MASK 0x00000002
-#define IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_SHIFT 1
-#define IOMMU_CONTROL_EVENT_LOG_ENABLE_MASK 0x00000004
-#define IOMMU_CONTROL_EVENT_LOG_ENABLE_SHIFT 2
-#define IOMMU_CONTROL_EVENT_LOG_INT_MASK 0x00000008
-#define IOMMU_CONTROL_EVENT_LOG_INT_SHIFT 3
-#define IOMMU_CONTROL_COMP_WAIT_INT_MASK 0x00000010
-#define IOMMU_CONTROL_COMP_WAIT_INT_SHIFT 4
-#define IOMMU_CONTROL_INVALIDATION_TIMEOUT_MASK 0x000000E0
-#define IOMMU_CONTROL_INVALIDATION_TIMEOUT_SHIFT 5
-#define IOMMU_CONTROL_PASS_POSTED_WRITE_MASK 0x00000100
-#define IOMMU_CONTROL_PASS_POSTED_WRITE_SHIFT 8
-#define IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_MASK 0x00000200
-#define IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_SHIFT 9
-#define IOMMU_CONTROL_COHERENT_MASK 0x00000400
-#define IOMMU_CONTROL_COHERENT_SHIFT 10
-#define IOMMU_CONTROL_ISOCHRONOUS_MASK 0x00000800
-#define IOMMU_CONTROL_ISOCHRONOUS_SHIFT 11
-#define IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_MASK 0x00001000
-#define IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT 12
-#define IOMMU_CONTROL_PPR_LOG_ENABLE_MASK 0x00002000
-#define IOMMU_CONTROL_PPR_LOG_ENABLE_SHIFT 13
-#define IOMMU_CONTROL_PPR_LOG_INT_MASK 0x00004000
-#define IOMMU_CONTROL_PPR_LOG_INT_SHIFT 14
-#define IOMMU_CONTROL_PPR_ENABLE_MASK 0x00008000
-#define IOMMU_CONTROL_PPR_ENABLE_SHIFT 15
-#define IOMMU_CONTROL_GT_ENABLE_MASK 0x00010000
-#define IOMMU_CONTROL_GT_ENABLE_SHIFT 16
-#define IOMMU_CONTROL_RESTART_MASK 0x80000000
-#define IOMMU_CONTROL_RESTART_SHIFT 31
+
+union amd_iommu_control {
+ uint64_t raw;
+ struct {
+ bool iommu_en:1;
+ bool ht_tun_en:1;
+ bool event_log_en:1;
+ bool event_int_en:1;
+ bool com_wait_int_en:1;
+ unsigned int inv_timeout:3;
+ bool pass_pw:1;
+ bool res_pass_pw:1;
+ bool coherent:1;
+ bool isoc:1;
+ bool cmd_buf_en:1;
+ bool ppr_log_en:1;
+ bool ppr_int_en:1;
+ bool ppr_en:1;
+ bool gt_en:1;
+ bool ga_en:1;
+ unsigned int crw:4;
+ bool smif_en:1;
+ bool slf_wb_dis:1;
+ bool smif_log_en:1;
+ unsigned int gam_en:3;
+ bool ga_log_en:1;
+ bool ga_int_en:1;
+ unsigned int dual_ppr_log_en:2;
+ unsigned int dual_event_log_en:2;
+ unsigned int dev_tbl_seg_en:3;
+ unsigned int priv_abrt_en:2;
+ bool ppr_auto_rsp_en:1;
+ bool marc_en:1;
+ bool blk_stop_mrk_en:1;
+ bool ppr_auto_rsp_aon:1;
+ bool domain_id_pne:1;
+ unsigned int :1;
+ bool eph_en:1;
+ unsigned int had_update:2;
+ bool gd_update_dis:1;
+ unsigned int :1;
+ bool xt_en:1;
+ bool int_cap_xt_en:1;
+ bool vcmd_en:1;
+ bool viommu_en:1;
+ bool ga_update_dis:1;
+ bool gappi_en:1;
+ unsigned int :8;
+ };
+};
/* Exclusion Register */
#define IOMMU_EXCLUSION_BASE_LOW_OFFSET 0x20
++++++ 5d41785b-AMD-IOMMU-bitfield-IRTE.patch ++++++
References: bsc#1135799
# Commit b75623c5062b741248547862a3e1b7d4b129c128
# Date 2019-07-31 13:15:39 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: use bit field for IRTE
At the same time restrict its scope to just the single source file
actually using it, and abstract accesses by introducing a union of
pointers. (A union of the actual table entries is not used to make it
impossible to [wrongly, once the 128-bit form gets added] perform
pointer arithmetic / array accesses on derived types.)
Also move away from updating the entries piecemeal: Construct a full new
entry, and write it out.
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -24,6 +24,26 @@
#include
#include
+union irte32 {
+ uint32_t raw;
+ struct {
+ bool remap_en:1;
+ bool sup_io_pf:1;
+ unsigned int int_type:3;
+ bool rq_eoi:1;
+ bool dm:1;
+ bool guest_mode:1; /* MBZ */
+ unsigned int dest:8;
+ unsigned int vector:8;
+ unsigned int :8;
+ } flds;
+};
+
+union irte_ptr {
+ void *ptr;
+ union irte32 *ptr32;
+};
+
#define INTREMAP_TABLE_ORDER 1
#define INTREMAP_LENGTH 0xB
#define INTREMAP_ENTRIES (1 << INTREMAP_LENGTH)
@@ -102,47 +122,45 @@ static unsigned int alloc_intremap_entry
return slot;
}
-static u32 *get_intremap_entry(int seg, int bdf, int offset)
+static union irte_ptr get_intremap_entry(unsigned int seg, unsigned int bdf,
+ unsigned int index)
{
- u32 *table = get_ivrs_mappings(seg)[bdf].intremap_table;
+ union irte_ptr table = {
+ .ptr = get_ivrs_mappings(seg)[bdf].intremap_table
+ };
+
+ ASSERT(table.ptr && (index < INTREMAP_ENTRIES));
- ASSERT( (table != NULL) && (offset < INTREMAP_ENTRIES) );
+ table.ptr32 += index;
- return table + offset;
+ return table;
}
-static void free_intremap_entry(int seg, int bdf, int offset)
-{
- u32 *entry = get_intremap_entry(seg, bdf, offset);
-
- memset(entry, 0, sizeof(u32));
- __clear_bit(offset, get_ivrs_mappings(seg)[bdf].intremap_inuse);
-}
-
-static void update_intremap_entry(u32* entry, u8 vector, u8 int_type,
- u8 dest_mode, u8 dest)
-{
- set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0,
- INT_REMAP_ENTRY_REMAPEN_MASK,
- INT_REMAP_ENTRY_REMAPEN_SHIFT, entry);
- set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, *entry,
- INT_REMAP_ENTRY_SUPIOPF_MASK,
- INT_REMAP_ENTRY_SUPIOPF_SHIFT, entry);
- set_field_in_reg_u32(int_type, *entry,
- INT_REMAP_ENTRY_INTTYPE_MASK,
- INT_REMAP_ENTRY_INTTYPE_SHIFT, entry);
- set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, *entry,
- INT_REMAP_ENTRY_REQEOI_MASK,
- INT_REMAP_ENTRY_REQEOI_SHIFT, entry);
- set_field_in_reg_u32((u32)dest_mode, *entry,
- INT_REMAP_ENTRY_DM_MASK,
- INT_REMAP_ENTRY_DM_SHIFT, entry);
- set_field_in_reg_u32((u32)dest, *entry,
- INT_REMAP_ENTRY_DEST_MAST,
- INT_REMAP_ENTRY_DEST_SHIFT, entry);
- set_field_in_reg_u32((u32)vector, *entry,
- INT_REMAP_ENTRY_VECTOR_MASK,
- INT_REMAP_ENTRY_VECTOR_SHIFT, entry);
+static void free_intremap_entry(unsigned int seg, unsigned int bdf,
+ unsigned int index)
+{
+ union irte_ptr entry = get_intremap_entry(seg, bdf, index);
+
+ ACCESS_ONCE(entry.ptr32->raw) = 0;
+
+ __clear_bit(index, get_ivrs_mappings(seg)[bdf].intremap_inuse);
+}
+
+static void update_intremap_entry(union irte_ptr entry, unsigned int vector,
+ unsigned int int_type,
+ unsigned int dest_mode, unsigned int dest)
+{
+ union irte32 irte = {
+ .flds = {
+ .remap_en = true,
+ .int_type = int_type,
+ .dm = dest_mode,
+ .dest = dest,
+ .vector = vector,
+ },
+ };
+
+ ACCESS_ONCE(entry.ptr32->raw) = irte.raw;
}
static inline int get_rte_index(const struct IO_APIC_route_entry *rte)
@@ -164,7 +182,7 @@ static int update_intremap_entry_from_io
u16 *index)
{
unsigned long flags;
- u32* entry;
+ union irte_ptr entry;
u8 delivery_mode, dest, vector, dest_mode;
int req_id;
spinlock_t *lock;
@@ -202,12 +220,8 @@ static int update_intremap_entry_from_io
* so need to recover vector and delivery mode from IRTE.
*/
ASSERT(get_rte_index(rte) == offset);
- vector = get_field_from_reg_u32(*entry,
- INT_REMAP_ENTRY_VECTOR_MASK,
- INT_REMAP_ENTRY_VECTOR_SHIFT);
- delivery_mode = get_field_from_reg_u32(*entry,
- INT_REMAP_ENTRY_INTTYPE_MASK,
- INT_REMAP_ENTRY_INTTYPE_SHIFT);
+ vector = entry.ptr32->flds.vector;
+ delivery_mode = entry.ptr32->flds.int_type;
}
update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest);
@@ -229,7 +243,7 @@ int __init amd_iommu_setup_ioapic_remapp
{
struct IO_APIC_route_entry rte;
unsigned long flags;
- u32* entry;
+ union irte_ptr entry;
int apic, pin;
u8 delivery_mode, dest, vector, dest_mode;
u16 seg, bdf, req_id;
@@ -408,16 +422,14 @@ unsigned int amd_iommu_read_ioapic_from_
u16 bdf = ioapic_sbdf[idx].bdf;
u16 seg = ioapic_sbdf[idx].seg;
u16 req_id = get_intremap_requestor_id(seg, bdf);
- const u32 *entry = get_intremap_entry(seg, req_id, offset);
+ union irte_ptr entry = get_intremap_entry(seg, req_id, offset);
ASSERT(offset == (val & (INTREMAP_ENTRIES - 1)));
val &= ~(INTREMAP_ENTRIES - 1);
- val |= get_field_from_reg_u32(*entry,
- INT_REMAP_ENTRY_INTTYPE_MASK,
- INT_REMAP_ENTRY_INTTYPE_SHIFT) << 8;
- val |= get_field_from_reg_u32(*entry,
- INT_REMAP_ENTRY_VECTOR_MASK,
- INT_REMAP_ENTRY_VECTOR_SHIFT);
+ val |= MASK_INSR(entry.ptr32->flds.int_type,
+ IO_APIC_REDIR_DELIV_MODE_MASK);
+ val |= MASK_INSR(entry.ptr32->flds.vector,
+ IO_APIC_REDIR_VECTOR_MASK);
}
return val;
@@ -428,7 +440,7 @@ static int update_intremap_entry_from_ms
int *remap_index, const struct msi_msg *msg, u32 *data)
{
unsigned long flags;
- u32* entry;
+ union irte_ptr entry;
u16 req_id, alias_id;
u8 delivery_mode, dest, vector, dest_mode;
spinlock_t *lock;
@@ -582,7 +594,7 @@ void amd_iommu_read_msi_from_ire(
const struct pci_dev *pdev = msi_desc->dev;
u16 bdf = pdev ? PCI_BDF2(pdev->bus, pdev->devfn) : hpet_sbdf.bdf;
u16 seg = pdev ? pdev->seg : hpet_sbdf.seg;
- const u32 *entry;
+ union irte_ptr entry;
if ( IS_ERR_OR_NULL(_find_iommu_for_device(seg, bdf)) )
return;
@@ -598,12 +610,10 @@ void amd_iommu_read_msi_from_ire(
}
msg->data &= ~(INTREMAP_ENTRIES - 1);
- msg->data |= get_field_from_reg_u32(*entry,
- INT_REMAP_ENTRY_INTTYPE_MASK,
- INT_REMAP_ENTRY_INTTYPE_SHIFT) << 8;
- msg->data |= get_field_from_reg_u32(*entry,
- INT_REMAP_ENTRY_VECTOR_MASK,
- INT_REMAP_ENTRY_VECTOR_SHIFT);
+ msg->data |= MASK_INSR(entry.ptr32->flds.int_type,
+ MSI_DATA_DELIVERY_MODE_MASK);
+ msg->data |= MASK_INSR(entry.ptr32->flds.vector,
+ MSI_DATA_VECTOR_MASK);
}
int __init amd_iommu_free_intremap_table(
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
@@ -502,22 +502,6 @@ union amd_iommu_ext_features {
#define IOMMU_CONTROL_DISABLED 0
#define IOMMU_CONTROL_ENABLED 1
-/* interrupt remapping table */
-#define INT_REMAP_ENTRY_REMAPEN_MASK 0x00000001
-#define INT_REMAP_ENTRY_REMAPEN_SHIFT 0
-#define INT_REMAP_ENTRY_SUPIOPF_MASK 0x00000002
-#define INT_REMAP_ENTRY_SUPIOPF_SHIFT 1
-#define INT_REMAP_ENTRY_INTTYPE_MASK 0x0000001C
-#define INT_REMAP_ENTRY_INTTYPE_SHIFT 2
-#define INT_REMAP_ENTRY_REQEOI_MASK 0x00000020
-#define INT_REMAP_ENTRY_REQEOI_SHIFT 5
-#define INT_REMAP_ENTRY_DM_MASK 0x00000040
-#define INT_REMAP_ENTRY_DM_SHIFT 6
-#define INT_REMAP_ENTRY_DEST_MAST 0x0000FF00
-#define INT_REMAP_ENTRY_DEST_SHIFT 8
-#define INT_REMAP_ENTRY_VECTOR_MASK 0x00FF0000
-#define INT_REMAP_ENTRY_VECTOR_SHIFT 16
-
#define INV_IOMMU_ALL_PAGES_ADDRESS ((1ULL << 63) - 1)
#define IOMMU_RING_BUFFER_PTR_MASK 0x0007FFF0
--- a/xen/include/asm-x86/io_apic.h
+++ b/xen/include/asm-x86/io_apic.h
@@ -21,6 +21,7 @@
/* I/O Unit Redirection Table */
#define IO_APIC_REDIR_VECTOR_MASK 0x000FF
+#define IO_APIC_REDIR_DELIV_MODE_MASK 0x00700
#define IO_APIC_REDIR_DEST_LOGICAL 0x00800
#define IO_APIC_REDIR_DEST_PHYSICAL 0x00000
#define IO_APIC_REDIR_SEND_PENDING (1 << 12)
++++++ 5d41787e-AMD-IOMMU-pass-IOMMU-to-gfu-intremap-entry.patch ++++++
References: bsc#1135799
# Commit d4919102cf31c070ccc908421ea24f4246b08eda
# Date 2019-07-31 13:16:14 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: pass IOMMU to {get,free,update}_intremap_entry()
The functions will want to know IOMMU properties (specifically the IRTE
size) subsequently.
Rather than introducing a second error path bogusly returning -E... from
amd_iommu_read_ioapic_from_ire(), also change the existing one to follow
VT-d in returning the raw (untranslated) IO-APIC RTE.
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -122,11 +122,11 @@ static unsigned int alloc_intremap_entry
return slot;
}
-static union irte_ptr get_intremap_entry(unsigned int seg, unsigned int bdf,
- unsigned int index)
+static union irte_ptr get_intremap_entry(const struct amd_iommu *iommu,
+ unsigned int bdf, unsigned int index)
{
union irte_ptr table = {
- .ptr = get_ivrs_mappings(seg)[bdf].intremap_table
+ .ptr = get_ivrs_mappings(iommu->seg)[bdf].intremap_table
};
ASSERT(table.ptr && (index < INTREMAP_ENTRIES));
@@ -136,18 +136,19 @@ static union irte_ptr get_intremap_entry
return table;
}
-static void free_intremap_entry(unsigned int seg, unsigned int bdf,
- unsigned int index)
+static void free_intremap_entry(const struct amd_iommu *iommu,
+ unsigned int bdf, unsigned int index)
{
- union irte_ptr entry = get_intremap_entry(seg, bdf, index);
+ union irte_ptr entry = get_intremap_entry(iommu, bdf, index);
ACCESS_ONCE(entry.ptr32->raw) = 0;
- __clear_bit(index, get_ivrs_mappings(seg)[bdf].intremap_inuse);
+ __clear_bit(index, get_ivrs_mappings(iommu->seg)[bdf].intremap_inuse);
}
-static void update_intremap_entry(union irte_ptr entry, unsigned int vector,
- unsigned int int_type,
+static void update_intremap_entry(const struct amd_iommu *iommu,
+ union irte_ptr entry,
+ unsigned int vector, unsigned int int_type,
unsigned int dest_mode, unsigned int dest)
{
union irte32 irte = {
@@ -212,7 +213,7 @@ static int update_intremap_entry_from_io
lo_update = 1;
}
- entry = get_intremap_entry(iommu->seg, req_id, offset);
+ entry = get_intremap_entry(iommu, req_id, offset);
if ( !lo_update )
{
/*
@@ -223,7 +224,7 @@ static int update_intremap_entry_from_io
vector = entry.ptr32->flds.vector;
delivery_mode = entry.ptr32->flds.int_type;
}
- update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest);
+ update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest);
spin_unlock_irqrestore(lock, flags);
@@ -288,8 +289,8 @@ int __init amd_iommu_setup_ioapic_remapp
spin_lock_irqsave(lock, flags);
offset = alloc_intremap_entry(seg, req_id, 1);
BUG_ON(offset >= INTREMAP_ENTRIES);
- entry = get_intremap_entry(iommu->seg, req_id, offset);
- update_intremap_entry(entry, vector,
+ entry = get_intremap_entry(iommu, req_id, offset);
+ update_intremap_entry(iommu, entry, vector,
delivery_mode, dest_mode, dest);
spin_unlock_irqrestore(lock, flags);
@@ -413,7 +414,7 @@ unsigned int amd_iommu_read_ioapic_from_
idx = ioapic_id_to_index(IO_APIC_ID(apic));
if ( idx == MAX_IO_APICS )
- return -EINVAL;
+ return val;
offset = ioapic_sbdf[idx].pin_2_idx[pin];
@@ -422,9 +423,13 @@ unsigned int amd_iommu_read_ioapic_from_
u16 bdf = ioapic_sbdf[idx].bdf;
u16 seg = ioapic_sbdf[idx].seg;
u16 req_id = get_intremap_requestor_id(seg, bdf);
- union irte_ptr entry = get_intremap_entry(seg, req_id, offset);
+ const struct amd_iommu *iommu = find_iommu_for_device(seg, bdf);
+ union irte_ptr entry;
+ if ( !iommu )
+ return val;
ASSERT(offset == (val & (INTREMAP_ENTRIES - 1)));
+ entry = get_intremap_entry(iommu, req_id, offset);
val &= ~(INTREMAP_ENTRIES - 1);
val |= MASK_INSR(entry.ptr32->flds.int_type,
IO_APIC_REDIR_DELIV_MODE_MASK);
@@ -454,7 +459,7 @@ static int update_intremap_entry_from_ms
lock = get_intremap_lock(iommu->seg, req_id);
spin_lock_irqsave(lock, flags);
for ( i = 0; i < nr; ++i )
- free_intremap_entry(iommu->seg, req_id, *remap_index + i);
+ free_intremap_entry(iommu, req_id, *remap_index + i);
spin_unlock_irqrestore(lock, flags);
goto done;
}
@@ -479,8 +484,8 @@ static int update_intremap_entry_from_ms
*remap_index = offset;
}
- entry = get_intremap_entry(iommu->seg, req_id, offset);
- update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest);
+ entry = get_intremap_entry(iommu, req_id, offset);
+ update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest);
spin_unlock_irqrestore(lock, flags);
*data = (msg->data & ~(INTREMAP_ENTRIES - 1)) | offset;
@@ -594,12 +599,13 @@ void amd_iommu_read_msi_from_ire(
const struct pci_dev *pdev = msi_desc->dev;
u16 bdf = pdev ? PCI_BDF2(pdev->bus, pdev->devfn) : hpet_sbdf.bdf;
u16 seg = pdev ? pdev->seg : hpet_sbdf.seg;
+ const struct amd_iommu *iommu = _find_iommu_for_device(seg, bdf);
union irte_ptr entry;
- if ( IS_ERR_OR_NULL(_find_iommu_for_device(seg, bdf)) )
+ if ( IS_ERR_OR_NULL(iommu) )
return;
- entry = get_intremap_entry(seg, get_dma_requestor_id(seg, bdf), offset);
+ entry = get_intremap_entry(iommu, get_dma_requestor_id(seg, bdf), offset);
if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
{
++++++ 5d4178ad-AMD-IOMMU-128bit-non-guest-APIC-IRTE.patch ++++++
References: bsc#1135799
# Commit 8f3c86381c3f3b58e77c45db7cfa5c31525f397c
# Date 2019-07-31 13:17:01 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: introduce 128-bit IRTE non-guest-APIC IRTE format
This is in preparation of actually enabling x2APIC mode, which requires
this wider IRTE format to be used.
A specific remark regarding the first hunk changing
amd_iommu_ioapic_update_ire(): This bypass was introduced for XSA-36,
i.e. by 94d4a1119d ("AMD,IOMMU: Clean up old entries in remapping
tables when creating new one"). Other code introduced by that change has
meanwhile disappeared or further changed, and I wonder if - rather than
adding an x2apic_enabled check to the conditional - the bypass couldn't
be deleted altogether. For now the goal is to affect the non-x2APIC
paths as little as possible.
Take the liberty and use the new "fresh" flag to suppress an unneeded
flush in update_intremap_entry_from_ioapic().
Signed-off-by: Jan Beulich
Reviewed-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -39,12 +39,36 @@ union irte32 {
} flds;
};
+union irte128 {
+ uint64_t raw[2];
+ struct {
+ bool remap_en:1;
+ bool sup_io_pf:1;
+ unsigned int int_type:3;
+ bool rq_eoi:1;
+ bool dm:1;
+ bool guest_mode:1; /* MBZ */
+ unsigned int dest_lo:24;
+ unsigned int :32;
+ unsigned int vector:8;
+ unsigned int :24;
+ unsigned int :24;
+ unsigned int dest_hi:8;
+ } full;
+};
+
union irte_ptr {
void *ptr;
union irte32 *ptr32;
+ union irte128 *ptr128;
};
-#define INTREMAP_TABLE_ORDER 1
+union irte_cptr {
+ const void *ptr;
+ const union irte32 *ptr32;
+ const union irte128 *ptr128;
+} __transparent__;
+
#define INTREMAP_LENGTH 0xB
#define INTREMAP_ENTRIES (1 << INTREMAP_LENGTH)
@@ -57,6 +81,13 @@ unsigned int nr_ioapic_sbdf;
static void dump_intremap_tables(unsigned char key);
+static unsigned int __init intremap_table_order(const struct amd_iommu *iommu)
+{
+ return iommu->ctrl.ga_en
+ ? get_order_from_bytes(INTREMAP_ENTRIES * sizeof(union irte128))
+ : get_order_from_bytes(INTREMAP_ENTRIES * sizeof(union irte32));
+}
+
unsigned int ioapic_id_to_index(unsigned int apic_id)
{
unsigned int idx;
@@ -131,7 +162,10 @@ static union irte_ptr get_intremap_entry
ASSERT(table.ptr && (index < INTREMAP_ENTRIES));
- table.ptr32 += index;
+ if ( iommu->ctrl.ga_en )
+ table.ptr128 += index;
+ else
+ table.ptr32 += index;
return table;
}
@@ -141,7 +175,22 @@ static void free_intremap_entry(const st
{
union irte_ptr entry = get_intremap_entry(iommu, bdf, index);
- ACCESS_ONCE(entry.ptr32->raw) = 0;
+ if ( iommu->ctrl.ga_en )
+ {
+ ACCESS_ONCE(entry.ptr128->raw[0]) = 0;
+ /*
+ * Low half (containing RemapEn) needs to be cleared first. Note that
+ * strictly speaking smp_wmb() isn't enough, as conceptually it expands
+ * to just barrier() when !CONFIG_SMP. But wmb() would be more than we
+ * need, since the IOMMU is a cache-coherent entity on the bus. And
+ * given that we don't allow CONFIG_SMP to be turned off, the SMP
+ * variant will do.
+ */
+ smp_wmb();
+ entry.ptr128->raw[1] = 0;
+ }
+ else
+ ACCESS_ONCE(entry.ptr32->raw) = 0;
__clear_bit(index, get_ivrs_mappings(iommu->seg)[bdf].intremap_inuse);
}
@@ -151,17 +200,44 @@ static void update_intremap_entry(const
unsigned int vector, unsigned int int_type,
unsigned int dest_mode, unsigned int dest)
{
- union irte32 irte = {
- .flds = {
- .remap_en = true,
- .int_type = int_type,
- .dm = dest_mode,
- .dest = dest,
- .vector = vector,
- },
- };
+ if ( iommu->ctrl.ga_en )
+ {
+ union irte128 irte = {
+ .full = {
+ .remap_en = true,
+ .int_type = int_type,
+ .dm = dest_mode,
+ .dest_lo = dest,
+ .dest_hi = dest >> 24,
+ .vector = vector,
+ },
+ };
+
+ ACCESS_ONCE(entry.ptr128->raw[0]) = 0;
+ /*
+ * Low half, in particular RemapEn, needs to be cleared first. See
+ * comment in free_intremap_entry() regarding the choice of barrier.
+ */
+ smp_wmb();
+ entry.ptr128->raw[1] = irte.raw[1];
+ /* High half needs to be set before low one (containing RemapEn). */
+ smp_wmb();
+ ACCESS_ONCE(entry.ptr128->raw[0]) = irte.raw[0];
+ }
+ else
+ {
+ union irte32 irte = {
+ .flds = {
+ .remap_en = true,
+ .int_type = int_type,
+ .dm = dest_mode,
+ .dest = dest,
+ .vector = vector,
+ },
+ };
- ACCESS_ONCE(entry.ptr32->raw) = irte.raw;
+ ACCESS_ONCE(entry.ptr32->raw) = irte.raw;
+ }
}
static inline int get_rte_index(const struct IO_APIC_route_entry *rte)
@@ -175,6 +251,11 @@ static inline void set_rte_index(struct
rte->delivery_mode = offset >> 8;
}
+static inline unsigned int get_full_dest(const union irte128 *entry)
+{
+ return entry->full.dest_lo | ((unsigned int)entry->full.dest_hi << 24);
+}
+
static int update_intremap_entry_from_ioapic(
int bdf,
struct amd_iommu *iommu,
@@ -184,10 +265,11 @@ static int update_intremap_entry_from_io
{
unsigned long flags;
union irte_ptr entry;
- u8 delivery_mode, dest, vector, dest_mode;
+ uint8_t delivery_mode, vector, dest_mode;
int req_id;
spinlock_t *lock;
- unsigned int offset;
+ unsigned int dest, offset;
+ bool fresh = false;
req_id = get_intremap_requestor_id(iommu->seg, bdf);
lock = get_intremap_lock(iommu->seg, req_id);
@@ -195,7 +277,7 @@ static int update_intremap_entry_from_io
delivery_mode = rte->delivery_mode;
vector = rte->vector;
dest_mode = rte->dest_mode;
- dest = rte->dest.logical.logical_dest;
+ dest = x2apic_enabled ? rte->dest.dest32 : rte->dest.logical.logical_dest;
spin_lock_irqsave(lock, flags);
@@ -210,25 +292,40 @@ static int update_intremap_entry_from_io
return -ENOSPC;
}
*index = offset;
- lo_update = 1;
+ fresh = true;
}
entry = get_intremap_entry(iommu, req_id, offset);
- if ( !lo_update )
+ if ( fresh )
+ /* nothing */;
+ else if ( !lo_update )
{
/*
* Low half of incoming RTE is already in remapped format,
* so need to recover vector and delivery mode from IRTE.
*/
ASSERT(get_rte_index(rte) == offset);
- vector = entry.ptr32->flds.vector;
+ if ( iommu->ctrl.ga_en )
+ vector = entry.ptr128->full.vector;
+ else
+ vector = entry.ptr32->flds.vector;
+ /* The IntType fields match for both formats. */
delivery_mode = entry.ptr32->flds.int_type;
}
+ else if ( x2apic_enabled )
+ {
+ /*
+ * High half of incoming RTE was read from the I/O APIC and hence may
+ * not hold the full destination, so need to recover full destination
+ * from IRTE.
+ */
+ dest = get_full_dest(entry.ptr128);
+ }
update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest);
spin_unlock_irqrestore(lock, flags);
- if ( iommu->enabled )
+ if ( iommu->enabled && !fresh )
{
spin_lock_irqsave(&iommu->lock, flags);
amd_iommu_flush_intremap(iommu, req_id);
@@ -286,6 +383,18 @@ int __init amd_iommu_setup_ioapic_remapp
dest_mode = rte.dest_mode;
dest = rte.dest.logical.logical_dest;
+ if ( iommu->ctrl.xt_en )
+ {
+ /*
+ * In x2APIC mode we have no way of discovering the high 24
+ * bits of the destination of an already enabled interrupt.
+ * We come here earlier than for xAPIC mode, so no interrupts
+ * should have been set up before.
+ */
+ AMD_IOMMU_DEBUG("Unmasked IO-APIC#%u entry %u in x2APIC mode\n",
+ IO_APIC_ID(apic), pin);
+ }
+
spin_lock_irqsave(lock, flags);
offset = alloc_intremap_entry(seg, req_id, 1);
BUG_ON(offset >= INTREMAP_ENTRIES);
@@ -320,7 +429,8 @@ void amd_iommu_ioapic_update_ire(
struct IO_APIC_route_entry new_rte = { 0 };
unsigned int rte_lo = (reg & 1) ? reg - 1 : reg;
unsigned int pin = (reg - 0x10) / 2;
- int saved_mask, seg, bdf, rc;
+ int seg, bdf, rc;
+ bool saved_mask, fresh = false;
struct amd_iommu *iommu;
unsigned int idx;
@@ -362,12 +472,22 @@ void amd_iommu_ioapic_update_ire(
*(((u32 *)&new_rte) + 1) = value;
}
- if ( new_rte.mask &&
- ioapic_sbdf[idx].pin_2_idx[pin] >= INTREMAP_ENTRIES )
+ if ( ioapic_sbdf[idx].pin_2_idx[pin] >= INTREMAP_ENTRIES )
{
ASSERT(saved_mask);
- __io_apic_write(apic, reg, value);
- return;
+
+ /*
+ * There's nowhere except the IRTE to store a full 32-bit destination,
+ * so we may not bypass entry allocation and updating of the low RTE
+ * half in the (usual) case of the high RTE half getting written first.
+ */
+ if ( new_rte.mask && !x2apic_enabled )
+ {
+ __io_apic_write(apic, reg, value);
+ return;
+ }
+
+ fresh = true;
}
/* mask the interrupt while we change the intremap table */
@@ -396,8 +516,12 @@ void amd_iommu_ioapic_update_ire(
if ( reg == rte_lo )
return;
- /* unmask the interrupt after we have updated the intremap table */
- if ( !saved_mask )
+ /*
+ * Unmask the interrupt after we have updated the intremap table. Also
+ * write the low half if a fresh entry was allocated for a high half
+ * update in x2APIC mode.
+ */
+ if ( !saved_mask || (x2apic_enabled && fresh) )
{
old_rte.mask = saved_mask;
__io_apic_write(apic, rte_lo, *((u32 *)&old_rte));
@@ -411,31 +535,40 @@ unsigned int amd_iommu_read_ioapic_from_
unsigned int offset;
unsigned int val = __io_apic_read(apic, reg);
unsigned int pin = (reg - 0x10) / 2;
+ uint16_t seg, bdf, req_id;
+ const struct amd_iommu *iommu;
+ union irte_ptr entry;
idx = ioapic_id_to_index(IO_APIC_ID(apic));
if ( idx == MAX_IO_APICS )
return val;
offset = ioapic_sbdf[idx].pin_2_idx[pin];
+ if ( offset >= INTREMAP_ENTRIES )
+ return val;
- if ( !(reg & 1) && offset < INTREMAP_ENTRIES )
- {
- u16 bdf = ioapic_sbdf[idx].bdf;
- u16 seg = ioapic_sbdf[idx].seg;
- u16 req_id = get_intremap_requestor_id(seg, bdf);
- const struct amd_iommu *iommu = find_iommu_for_device(seg, bdf);
- union irte_ptr entry;
+ seg = ioapic_sbdf[idx].seg;
+ bdf = ioapic_sbdf[idx].bdf;
+ iommu = find_iommu_for_device(seg, bdf);
+ if ( !iommu )
+ return val;
+ req_id = get_intremap_requestor_id(seg, bdf);
+ entry = get_intremap_entry(iommu, req_id, offset);
- if ( !iommu )
- return val;
+ if ( !(reg & 1) )
+ {
ASSERT(offset == (val & (INTREMAP_ENTRIES - 1)));
- entry = get_intremap_entry(iommu, req_id, offset);
val &= ~(INTREMAP_ENTRIES - 1);
+ /* The IntType fields match for both formats. */
val |= MASK_INSR(entry.ptr32->flds.int_type,
IO_APIC_REDIR_DELIV_MODE_MASK);
- val |= MASK_INSR(entry.ptr32->flds.vector,
+ val |= MASK_INSR(iommu->ctrl.ga_en
+ ? entry.ptr128->full.vector
+ : entry.ptr32->flds.vector,
IO_APIC_REDIR_VECTOR_MASK);
}
+ else if ( x2apic_enabled )
+ val = get_full_dest(entry.ptr128);
return val;
}
@@ -447,9 +580,9 @@ static int update_intremap_entry_from_ms
unsigned long flags;
union irte_ptr entry;
u16 req_id, alias_id;
- u8 delivery_mode, dest, vector, dest_mode;
+ uint8_t delivery_mode, vector, dest_mode;
spinlock_t *lock;
- unsigned int offset, i;
+ unsigned int dest, offset, i;
req_id = get_dma_requestor_id(iommu->seg, bdf);
alias_id = get_intremap_requestor_id(iommu->seg, bdf);
@@ -470,7 +603,12 @@ static int update_intremap_entry_from_ms
dest_mode = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1;
delivery_mode = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1;
vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & MSI_DATA_VECTOR_MASK;
- dest = (msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff;
+
+ if ( x2apic_enabled )
+ dest = msg->dest32;
+ else
+ dest = MASK_EXTR(msg->address_lo, MSI_ADDR_DEST_ID_MASK);
+
offset = *remap_index;
if ( offset >= INTREMAP_ENTRIES )
{
@@ -616,10 +754,21 @@ void amd_iommu_read_msi_from_ire(
}
msg->data &= ~(INTREMAP_ENTRIES - 1);
+ /* The IntType fields match for both formats. */
msg->data |= MASK_INSR(entry.ptr32->flds.int_type,
MSI_DATA_DELIVERY_MODE_MASK);
- msg->data |= MASK_INSR(entry.ptr32->flds.vector,
- MSI_DATA_VECTOR_MASK);
+ if ( iommu->ctrl.ga_en )
+ {
+ msg->data |= MASK_INSR(entry.ptr128->full.vector,
+ MSI_DATA_VECTOR_MASK);
+ msg->dest32 = get_full_dest(entry.ptr128);
+ }
+ else
+ {
+ msg->data |= MASK_INSR(entry.ptr32->flds.vector,
+ MSI_DATA_VECTOR_MASK);
+ msg->dest32 = entry.ptr32->flds.dest;
+ }
}
int __init amd_iommu_free_intremap_table(
@@ -631,7 +780,7 @@ int __init amd_iommu_free_intremap_table
if ( tb )
{
- __free_amd_iommu_tables(tb, INTREMAP_TABLE_ORDER);
+ __free_amd_iommu_tables(tb, intremap_table_order(iommu));
ivrs_mapping->intremap_table = NULL;
}
@@ -641,10 +790,10 @@ int __init amd_iommu_free_intremap_table
void *__init amd_iommu_alloc_intremap_table(
const struct amd_iommu *iommu, unsigned long **inuse_map)
{
- void *tb;
- tb = __alloc_amd_iommu_tables(INTREMAP_TABLE_ORDER);
+ void *tb = __alloc_amd_iommu_tables(intremap_table_order(iommu));
+
BUG_ON(tb == NULL);
- memset(tb, 0, PAGE_SIZE * (1UL << INTREMAP_TABLE_ORDER));
+ memset(tb, 0, PAGE_SIZE << intremap_table_order(iommu));
*inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(INTREMAP_ENTRIES));
BUG_ON(*inuse_map == NULL);
return tb;
@@ -685,18 +834,29 @@ int __init amd_setup_hpet_msi(struct msi
return rc;
}
-static void dump_intremap_table(const u32 *table)
+static void dump_intremap_table(const struct amd_iommu *iommu,
+ union irte_cptr tbl)
{
- u32 count;
+ unsigned int count;
- if ( !table )
+ if ( !tbl.ptr )
return;
for ( count = 0; count < INTREMAP_ENTRIES; count++ )
{
- if ( !table[count] )
- continue;
- printk(" IRTE[%03x] %08x\n", count, table[count]);
+ if ( iommu->ctrl.ga_en )
+ {
+ if ( !tbl.ptr128[count].raw[0] && !tbl.ptr128[count].raw[1] )
+ continue;
+ printk(" IRTE[%03x] %016lx_%016lx\n",
+ count, tbl.ptr128[count].raw[1], tbl.ptr128[count].raw[0]);
+ }
+ else
+ {
+ if ( !tbl.ptr32[count].raw )
+ continue;
+ printk(" IRTE[%03x] %08x\n", count, tbl.ptr32[count].raw);
+ }
}
}
@@ -714,7 +874,7 @@ static int dump_intremap_mapping(const s
PCI_FUNC(ivrs_mapping->dte_requestor_id));
spin_lock_irqsave(&(ivrs_mapping->intremap_lock), flags);
- dump_intremap_table(ivrs_mapping->intremap_table);
+ dump_intremap_table(iommu, ivrs_mapping->intremap_table);
spin_unlock_irqrestore(&(ivrs_mapping->intremap_lock), flags);
process_pending_softirqs();
@@ -733,6 +893,8 @@ static void dump_intremap_tables(unsigne
printk("--- Dumping Shared IOMMU Interrupt Remapping Table ---\n");
spin_lock_irqsave(&shared_intremap_lock, flags);
- dump_intremap_table(shared_intremap_table);
+ dump_intremap_table(list_first_entry(&amd_iommu_head, struct amd_iommu,
+ list),
+ shared_intremap_table);
spin_unlock_irqrestore(&shared_intremap_lock, flags);
}
++++++ 5d4178fc-AMD-IOMMU-split-amd_iommu_init_one.patch ++++++
References: bsc#1135799
# Commit 21600515305dbecd2ae70d7b2ce936a9c200475f
# Date 2019-07-31 13:18:20 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: split amd_iommu_init_one()
Mapping the MMIO space and obtaining feature information need to happen
slightly earlier, such that for x2APIC support we can set XTEn prior to
calling amd_iommu_update_ivrs_mapping_acpi() and
amd_iommu_setup_ioapic_remapping().
Signed-off-by: Jan Beulich
Reviewed-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -972,14 +972,6 @@ static int __init amd_iommu_init_one(str
{
pci_hide_device(iommu->seg, PCI_BUS(iommu->bdf), PCI_DEVFN2(iommu->bdf));
- if ( map_iommu_mmio_region(iommu) != 0 )
- goto error_out;
-
- get_iommu_features(iommu);
-
- if ( iommu->features.raw )
- iommuv2_enabled = 1;
-
if ( allocate_cmd_buffer(iommu) == NULL )
goto error_out;
@@ -1204,6 +1196,23 @@ static bool_t __init amd_sp5100_erratum2
return 0;
}
+static int __init amd_iommu_prepare_one(struct amd_iommu *iommu)
+{
+ int rc = alloc_ivrs_mappings(iommu->seg);
+
+ if ( !rc )
+ rc = map_iommu_mmio_region(iommu);
+ if ( rc )
+ return rc;
+
+ get_iommu_features(iommu);
+
+ if ( iommu->features.raw )
+ iommuv2_enabled = true;
+
+ return 0;
+}
+
int __init amd_iommu_init(void)
{
struct amd_iommu *iommu;
@@ -1234,7 +1243,7 @@ int __init amd_iommu_init(void)
radix_tree_init(&ivrs_maps);
for_each_amd_iommu ( iommu )
{
- rc = alloc_ivrs_mappings(iommu->seg);
+ rc = amd_iommu_prepare_one(iommu);
if ( rc )
goto error_out;
}
++++++ 5d41793f-AMD-IOMMU-allow-enabling-without-IRQ.patch ++++++
# Commit 5f569f1ac50eff9dc95ac2e4a617de657d254b52
# Date 2019-07-31 13:19:27 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: allow enabling with IRQ not yet set up
Early enabling (to enter x2APIC mode) requires deferring the IRQ
setup. Code to actually do that setup in the x2APIC case will get added
subsequently.
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -814,7 +814,6 @@ static void amd_iommu_erratum_746_workar
static void enable_iommu(struct amd_iommu *iommu)
{
unsigned long flags;
- struct irq_desc *desc;
spin_lock_irqsave(&iommu->lock, flags);
@@ -834,19 +833,27 @@ static void enable_iommu(struct amd_iomm
if ( iommu->features.flds.ppr_sup )
register_iommu_ppr_log_in_mmio_space(iommu);
- desc = irq_to_desc(iommu->msi.irq);
- spin_lock(&desc->lock);
- set_msi_affinity(desc, NULL);
- spin_unlock(&desc->lock);
+ if ( iommu->msi.irq > 0 )
+ {
+ struct irq_desc *desc = irq_to_desc(iommu->msi.irq);
+
+ spin_lock(&desc->lock);
+ set_msi_affinity(desc, NULL);
+ spin_unlock(&desc->lock);
+ }
amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED);
set_iommu_ht_flags(iommu);
set_iommu_command_buffer_control(iommu, IOMMU_CONTROL_ENABLED);
- set_iommu_event_log_control(iommu, IOMMU_CONTROL_ENABLED);
- if ( iommu->features.flds.ppr_sup )
- set_iommu_ppr_log_control(iommu, IOMMU_CONTROL_ENABLED);
+ if ( iommu->msi.irq > 0 )
+ {
+ set_iommu_event_log_control(iommu, IOMMU_CONTROL_ENABLED);
+
+ if ( iommu->features.flds.ppr_sup )
+ set_iommu_ppr_log_control(iommu, IOMMU_CONTROL_ENABLED);
+ }
if ( iommu->features.flds.gt_sup )
set_iommu_guest_translation_control(iommu, IOMMU_CONTROL_ENABLED);
++++++ 5d417a16-AMD-IOMMU-adjust-IRQ-setup-for-x2APIC.patch ++++++
References: bsc#1135799
# Commit d9e49d1afe2ec45754734845f5c0fbc7effdd3d8
# Date 2019-07-31 13:23:02 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: adjust setup of internal interrupt for x2APIC mode
In order to be able to express all possible destinations we need to make
use of this non-MSI-capability based mechanism. The new IRQ controller
structure can re-use certain MSI functions, though.
For now general and PPR interrupts still share a single vector, IRQ, and
hence handler.
Signed-off-by: Jan Beulich
Reviewed-by: Andrew Cooper
Acked-by: Brian Woods
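The common thread between this control register layout and the 128-bit IRTE
format is that a full 32-bit x2APIC destination gets split into a 24-bit low
and an 8-bit high field. A tiny hedged round-trip sketch (the value is
arbitrary, chosen only for illustration):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t dest = 0x01234567;            /* arbitrary x2APIC ID */
        uint32_t dest_lo = dest & 0xffffff;    /* 24-bit field        */
        uint32_t dest_hi = dest >> 24;         /* 8-bit field         */

        /* Reassembly, mirroring what get_full_dest() does: */
        assert((dest_lo | (dest_hi << 24)) == dest);
        return 0;
    }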
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -472,6 +472,44 @@ static hw_irq_controller iommu_maskable_
.set_affinity = set_msi_affinity,
};
+static void set_x2apic_affinity(struct irq_desc *desc, const cpumask_t *mask)
+{
+ struct amd_iommu *iommu = desc->action->dev_id;
+ unsigned int dest = set_desc_affinity(desc, mask);
+ union amd_iommu_x2apic_control ctrl = {};
+ unsigned long flags;
+
+ if ( dest == BAD_APICID )
+ return;
+
+ msi_compose_msg(desc->arch.vector, NULL, &iommu->msi.msg);
+ iommu->msi.msg.dest32 = dest;
+
+ ctrl.dest_mode = MASK_EXTR(iommu->msi.msg.address_lo,
+ MSI_ADDR_DESTMODE_MASK);
+ ctrl.int_type = MASK_EXTR(iommu->msi.msg.data,
+ MSI_DATA_DELIVERY_MODE_MASK);
+ ctrl.vector = desc->arch.vector;
+ ctrl.dest_lo = dest;
+ ctrl.dest_hi = dest >> 24;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ writeq(ctrl.raw, iommu->mmio_base + IOMMU_XT_INT_CTRL_MMIO_OFFSET);
+ writeq(ctrl.raw, iommu->mmio_base + IOMMU_XT_PPR_INT_CTRL_MMIO_OFFSET);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static hw_irq_controller iommu_x2apic_type = {
+ .typename = "IOMMU-x2APIC",
+ .startup = irq_startup_none,
+ .shutdown = irq_shutdown_none,
+ .enable = irq_enable_none,
+ .disable = irq_disable_none,
+ .ack = ack_nonmaskable_msi_irq,
+ .end = end_nonmaskable_msi_irq,
+ .set_affinity = set_x2apic_affinity,
+};
+
static void parse_event_log_entry(struct amd_iommu *iommu, u32 entry[])
{
u16 domain_id, device_id, flags;
@@ -726,8 +764,6 @@ static void iommu_interrupt_handler(int
static bool_t __init set_iommu_interrupt_handler(struct amd_iommu *iommu)
{
int irq, ret;
- hw_irq_controller *handler;
- u16 control;
irq = create_irq(NUMA_NO_NODE);
if ( irq <= 0 )
@@ -747,20 +783,43 @@ static bool_t __init set_iommu_interrupt
PCI_SLOT(iommu->bdf), PCI_FUNC(iommu->bdf));
return 0;
}
- control = pci_conf_read16(iommu->seg, PCI_BUS(iommu->bdf),
- PCI_SLOT(iommu->bdf), PCI_FUNC(iommu->bdf),
- iommu->msi.msi_attrib.pos + PCI_MSI_FLAGS);
- iommu->msi.msi.nvec = 1;
- if ( is_mask_bit_support(control) )
- {
- iommu->msi.msi_attrib.maskbit = 1;
- iommu->msi.msi.mpos = msi_mask_bits_reg(iommu->msi.msi_attrib.pos,
- is_64bit_address(control));
- handler = &iommu_maskable_msi_type;
+
+ if ( iommu->ctrl.int_cap_xt_en )
+ {
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ iommu->msi.msi_attrib.pos = MSI_TYPE_IOMMU;
+ iommu->msi.msi_attrib.maskbit = 0;
+ iommu->msi.msi_attrib.is_64 = 1;
+
+ desc->msi_desc = &iommu->msi;
+ desc->handler = &iommu_x2apic_type;
+
+ ret = 0;
}
else
- handler = &iommu_msi_type;
- ret = __setup_msi_irq(irq_to_desc(irq), &iommu->msi, handler);
+ {
+ hw_irq_controller *handler;
+ u16 control;
+
+ control = pci_conf_read16(iommu->seg, PCI_BUS(iommu->bdf),
+ PCI_SLOT(iommu->bdf), PCI_FUNC(iommu->bdf),
+ iommu->msi.msi_attrib.pos + PCI_MSI_FLAGS);
+
+ iommu->msi.msi.nvec = 1;
+ if ( is_mask_bit_support(control) )
+ {
+ iommu->msi.msi_attrib.maskbit = 1;
+ iommu->msi.msi.mpos = msi_mask_bits_reg(iommu->msi.msi_attrib.pos,
+ is_64bit_address(control));
+ handler = &iommu_maskable_msi_type;
+ }
+ else
+ handler = &iommu_msi_type;
+
+ ret = __setup_msi_irq(irq_to_desc(irq), &iommu->msi, handler);
+ }
+
if ( !ret )
ret = request_irq(irq, 0, iommu_interrupt_handler, "amd_iommu", iommu);
if ( ret )
@@ -838,8 +897,19 @@ static void enable_iommu(struct amd_iomm
struct irq_desc *desc = irq_to_desc(iommu->msi.irq);
spin_lock(&desc->lock);
- set_msi_affinity(desc, NULL);
- spin_unlock(&desc->lock);
+
+ if ( iommu->ctrl.int_cap_xt_en )
+ {
+ set_x2apic_affinity(desc, NULL);
+ spin_unlock(&desc->lock);
+ }
+ else
+ {
+ set_msi_affinity(desc, NULL);
+ spin_unlock(&desc->lock);
+
+ amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED);
+ }
}
amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED);
@@ -879,7 +949,9 @@ static void disable_iommu(struct amd_iom
return;
}
- amd_iommu_msi_enable(iommu, IOMMU_CONTROL_DISABLED);
+ if ( !iommu->ctrl.int_cap_xt_en )
+ amd_iommu_msi_enable(iommu, IOMMU_CONTROL_DISABLED);
+
set_iommu_command_buffer_control(iommu, IOMMU_CONTROL_DISABLED);
set_iommu_event_log_control(iommu, IOMMU_CONTROL_DISABLED);
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
@@ -432,6 +432,25 @@ union amd_iommu_ext_features {
} flds;
};
+/* x2APIC Control Registers */
+#define IOMMU_XT_INT_CTRL_MMIO_OFFSET 0x0170
+#define IOMMU_XT_PPR_INT_CTRL_MMIO_OFFSET 0x0178
+#define IOMMU_XT_GA_INT_CTRL_MMIO_OFFSET 0x0180
+
+union amd_iommu_x2apic_control {
+ uint64_t raw;
+ struct {
+ unsigned int :2;
+ unsigned int dest_mode:1;
+ unsigned int :5;
+ unsigned int dest_lo:24;
+ unsigned int vector:8;
+ unsigned int int_type:1; /* DM in IOMMU spec 3.04 */
+ unsigned int :15;
+ unsigned int dest_hi:8;
+ };
+};
+
/* Status Register*/
#define IOMMU_STATUS_MMIO_OFFSET 0x2020
#define IOMMU_STATUS_EVENT_OVERFLOW_MASK 0x00000001
++++++ 5d417ab6-AMD-IOMMU-enable-x2APIC-mode.patch ++++++
References: bsc#1135799
# Commit 0e8e0a0854a00d81267a5e9c9616a3fbd2729747
# Date 2019-07-31 13:25:42 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: enable x2APIC mode when available
In order for the CPUs to use x2APIC mode, the IOMMU(s) first need to be
switched into suitable state.
The post-AP-bringup IRQ affinity adjustment is done also for the non-
x2APIC case, matching what VT-d does.
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -834,6 +834,30 @@ static bool_t __init set_iommu_interrupt
return 1;
}
+int iov_adjust_irq_affinities(void)
+{
+ const struct amd_iommu *iommu;
+
+ if ( !iommu_enabled )
+ return 0;
+
+ for_each_amd_iommu ( iommu )
+ {
+ struct irq_desc *desc = irq_to_desc(iommu->msi.irq);
+ unsigned long flags;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ if ( iommu->ctrl.int_cap_xt_en )
+ set_x2apic_affinity(desc, NULL);
+ else
+ set_msi_affinity(desc, NULL);
+ spin_unlock_irqrestore(&desc->lock, flags);
+ }
+
+ return 0;
+}
+__initcall(iov_adjust_irq_affinities);
+
/*
* Family15h Model 10h-1fh erratum 746 (IOMMU Logging May Stall Translations)
* Workaround:
@@ -1047,7 +1071,7 @@ static void * __init allocate_ppr_log(st
IOMMU_PPR_LOG_DEFAULT_ENTRIES, "PPR Log");
}
-static int __init amd_iommu_init_one(struct amd_iommu *iommu)
+static int __init amd_iommu_init_one(struct amd_iommu *iommu, bool intr)
{
pci_hide_device(iommu->seg, PCI_BUS(iommu->bdf), PCI_DEVFN2(iommu->bdf));
@@ -1060,7 +1084,7 @@ static int __init amd_iommu_init_one(str
if ( iommu->features.flds.ppr_sup && !allocate_ppr_log(iommu) )
goto error_out;
- if ( !set_iommu_interrupt_handler(iommu) )
+ if ( intr && !set_iommu_interrupt_handler(iommu) )
goto error_out;
/* To make sure that device_table.buffer has been successfully allocated */
@@ -1089,8 +1113,16 @@ static void __init amd_iommu_init_cleanu
list_for_each_entry_safe ( iommu, next, &amd_iommu_head, list )
{
list_del(&iommu->list);
+
+ iommu->ctrl.ga_en = 0;
+ iommu->ctrl.xt_en = 0;
+ iommu->ctrl.int_cap_xt_en = 0;
+
if ( iommu->enabled )
disable_iommu(iommu);
+ else if ( iommu->mmio_base )
+ writeq(iommu->ctrl.raw,
+ iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
deallocate_ring_buffer(&iommu->cmd_buffer);
deallocate_ring_buffer(&iommu->event_log);
@@ -1292,7 +1324,7 @@ static int __init amd_iommu_prepare_one(
return 0;
}
-int __init amd_iommu_init(void)
+int __init amd_iommu_prepare(bool xt)
{
struct amd_iommu *iommu;
int rc = -ENODEV;
@@ -1307,9 +1339,14 @@ int __init amd_iommu_init(void)
if ( unlikely(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI) )
goto error_out;
+ /* Have we been here before? */
+ if ( ivhd_type )
+ return 0;
+
rc = amd_iommu_get_supported_ivhd_type();
if ( rc < 0 )
goto error_out;
+ BUG_ON(!rc);
ivhd_type = rc;
rc = amd_iommu_get_ivrs_dev_entries();
@@ -1325,9 +1362,37 @@ int __init amd_iommu_init(void)
rc = amd_iommu_prepare_one(iommu);
if ( rc )
goto error_out;
+
+ rc = -ENODEV;
+ if ( xt && (!iommu->features.flds.ga_sup || !iommu->features.flds.xt_sup) )
+ goto error_out;
+ }
+
+ for_each_amd_iommu ( iommu )
+ {
+ /* NB: There's no need to actually write these out right here. */
+ iommu->ctrl.ga_en |= xt;
+ iommu->ctrl.xt_en = xt;
+ iommu->ctrl.int_cap_xt_en = xt;
}
rc = amd_iommu_update_ivrs_mapping_acpi();
+
+ error_out:
+ if ( rc )
+ {
+ amd_iommu_init_cleanup();
+ ivhd_type = 0;
+ }
+
+ return rc;
+}
+
+int __init amd_iommu_init(bool xt)
+{
+ struct amd_iommu *iommu;
+ int rc = amd_iommu_prepare(xt);
+
if ( rc )
goto error_out;
@@ -1353,7 +1418,12 @@ int __init amd_iommu_init(void)
/* per iommu initialization */
for_each_amd_iommu ( iommu )
{
- rc = amd_iommu_init_one(iommu);
+ /*
+ * Setting up of the IOMMU interrupts cannot occur yet at the (very
+ * early) time we get here when enabling x2APIC mode. Suppress it
+ * here, and do it explicitly in amd_iommu_init_interrupt().
+ */
+ rc = amd_iommu_init_one(iommu, !xt);
if ( rc )
goto error_out;
}
@@ -1365,6 +1435,40 @@ error_out:
return rc;
}
+int __init amd_iommu_init_interrupt(void)
+{
+ struct amd_iommu *iommu;
+ int rc = 0;
+
+ for_each_amd_iommu ( iommu )
+ {
+ struct irq_desc *desc;
+
+ if ( !set_iommu_interrupt_handler(iommu) )
+ {
+ rc = -EIO;
+ break;
+ }
+
+ desc = irq_to_desc(iommu->msi.irq);
+
+ spin_lock(&desc->lock);
+ ASSERT(iommu->ctrl.int_cap_xt_en);
+ set_x2apic_affinity(desc, &cpu_online_map);
+ spin_unlock(&desc->lock);
+
+ set_iommu_event_log_control(iommu, IOMMU_CONTROL_ENABLED);
+
+ if ( iommu->features.flds.ppr_sup )
+ set_iommu_ppr_log_control(iommu, IOMMU_CONTROL_ENABLED);
+ }
+
+ if ( rc )
+ amd_iommu_init_cleanup();
+
+ return rc;
+}
+
static void invalidate_all_domain_pages(void)
{
struct domain *d;
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -799,6 +799,35 @@ void *__init amd_iommu_alloc_intremap_ta
return tb;
}
+bool __init iov_supports_xt(void)
+{
+ unsigned int apic;
+
+ if ( !iommu_enable || !iommu_intremap )
+ return false;
+
+ if ( amd_iommu_prepare(true) )
+ return false;
+
+ for ( apic = 0; apic < nr_ioapics; apic++ )
+ {
+ unsigned int idx = ioapic_id_to_index(IO_APIC_ID(apic));
+
+ if ( idx == MAX_IO_APICS )
+ return false;
+
+ if ( !find_iommu_for_device(ioapic_sbdf[idx].seg,
+ ioapic_sbdf[idx].bdf) )
+ {
+ AMD_IOMMU_DEBUG("No IOMMU for IO-APIC %#x (ID %x)\n",
+ apic, IO_APIC_ID(apic));
+ return false;
+ }
+ }
+
+ return true;
+}
+
int __init amd_setup_hpet_msi(struct msi_desc *msi_desc)
{
spinlock_t *lock;
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -193,7 +193,8 @@ static int __init iov_detect(void)
if ( !iommu_enable && !iommu_intremap )
return 0;
- if ( amd_iommu_init() != 0 )
+ if ( (init_done ? amd_iommu_init_interrupt()
+ : amd_iommu_init(false)) != 0 )
{
printk("AMD-Vi: Error initialization\n");
return -ENODEV;
@@ -206,6 +207,25 @@ static int __init iov_detect(void)
return scan_pci_devices();
}
+static int iov_enable_xt(void)
+{
+ int rc;
+
+ if ( system_state >= SYS_STATE_active )
+ return 0;
+
+ if ( (rc = amd_iommu_init(true)) != 0 )
+ {
+ printk("AMD-Vi: Error %d initializing for x2APIC mode\n", rc);
+ /* -ENXIO has special meaning to the caller - convert it. */
+ return rc != -ENXIO ? rc : -ENODATA;
+ }
+
+ init_done = true;
+
+ return 0;
+}
+
int amd_iommu_alloc_root(struct domain_iommu *hd)
{
if ( unlikely(!hd->arch.root_table) )
@@ -596,11 +616,13 @@ static const struct iommu_ops __initcons
.free_page_table = deallocate_page_table,
.reassign_device = reassign_device,
.get_device_group_id = amd_iommu_group_id,
+ .enable_x2apic = iov_enable_xt,
.update_ire_from_apic = amd_iommu_ioapic_update_ire,
.update_ire_from_msi = amd_iommu_msi_msg_update_ire,
.read_apic_from_ire = amd_iommu_read_ioapic_from_ire,
.read_msi_from_ire = amd_iommu_read_msi_from_ire,
.setup_hpet_msi = amd_setup_hpet_msi,
+ .adjust_irq_affinities = iov_adjust_irq_affinities,
.suspend = amd_iommu_suspend,
.resume = amd_iommu_resume,
.share_p2m = amd_iommu_share_p2m,
@@ -611,4 +633,5 @@ static const struct iommu_ops __initcons
static const struct iommu_init_ops __initconstrel _iommu_init_ops = {
.ops = &_iommu_ops,
.setup = iov_detect,
+ .supports_x2apic = iov_supports_xt,
};
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -48,8 +48,11 @@ int amd_iommu_detect_acpi(void);
void get_iommu_features(struct amd_iommu *iommu);
/* amd-iommu-init functions */
-int amd_iommu_init(void);
+int amd_iommu_prepare(bool xt);
+int amd_iommu_init(bool xt);
+int amd_iommu_init_interrupt(void);
int amd_iommu_update_ivrs_mapping_acpi(void);
+int iov_adjust_irq_affinities(void);
/* mapping functions */
int __must_check amd_iommu_map_page(struct domain *d, dfn_t dfn,
@@ -95,6 +98,7 @@ void amd_iommu_flush_all_caches(struct a
struct amd_iommu *find_iommu_for_device(int seg, int bdf);
/* interrupt remapping */
+bool iov_supports_xt(void);
int amd_iommu_setup_ioapic_remapping(void);
void *amd_iommu_alloc_intremap_table(
const struct amd_iommu *, unsigned long **);
++++++ 5d417b38-AMD-IOMMU-correct-IRTE-updating.patch ++++++
References: bsc#1135799
# Commit 9e0e225a3aeccd807a8db88ba4669f8ab30ecc99
# Date 2019-07-31 13:27:52 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: correct IRTE updating
Flushing didn't get done along the lines of what the specification says.
Mark entries to be updated as not remapped (which will result in
interrupt requests getting target aborted, but the interrupts should be
masked anyway at that point), issue the flush, and only then
write the new entry.
In update_intremap_entry_from_msi_msg() also fold the duplicate initial
lock determination and acquire into just a single instance.
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
Acked-by: Brian Woods
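Paraphrased as a hedged standalone outline (helper names are hypothetical;
the real code additionally drops and re-takes the per-table lock around each
flush), the resulting update protocol is:

    #include <stdbool.h>
    #include <stdint.h>

    /* Hypothetical stand-ins for Xen internals. */
    extern bool iommu_enabled;
    extern bool irte_remap_en(const uint64_t *entry);
    extern void irte_set_remap_en(uint64_t *entry, bool on);
    extern void flush_intremap(unsigned int req_id);
    extern void irte_write(uint64_t *entry, uint64_t lo, uint64_t hi);

    static void irte_update(uint64_t *entry, unsigned int req_id,
                            uint64_t lo, uint64_t hi)
    {
        /* 1) Mark the entry not remapped: requests now get target
         *    aborted, but the interrupt is masked at this point.
         * 2) Flush the IOMMU's cached copy of the entry.
         * 3) Only then write the replacement. */
        while ( iommu_enabled && irte_remap_en(entry) )
        {
            irte_set_remap_en(entry, false);
            flush_intremap(req_id);
        }

        irte_write(entry, lo, hi);
    }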
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -213,15 +213,13 @@ static void update_intremap_entry(const
},
};
- ACCESS_ONCE(entry.ptr128->raw[0]) = 0;
+ ASSERT(!entry.ptr128->full.remap_en);
+ entry.ptr128->raw[1] = irte.raw[1];
/*
- * Low half, in particular RemapEn, needs to be cleared first. See
+ * High half needs to be set before low one (containing RemapEn). See
* comment in free_intremap_entry() regarding the choice of barrier.
*/
smp_wmb();
- entry.ptr128->raw[1] = irte.raw[1];
- /* High half needs to be set before low one (containing RemapEn). */
- smp_wmb();
ACCESS_ONCE(entry.ptr128->raw[0]) = irte.raw[0];
}
else
@@ -296,6 +294,20 @@ static int update_intremap_entry_from_io
}
entry = get_intremap_entry(iommu, req_id, offset);
+
+ /* The RemapEn fields match for all formats. */
+ while ( iommu->enabled && entry.ptr32->flds.remap_en )
+ {
+ entry.ptr32->flds.remap_en = false;
+ spin_unlock(lock);
+
+ spin_lock(&iommu->lock);
+ amd_iommu_flush_intremap(iommu, req_id);
+ spin_unlock(&iommu->lock);
+
+ spin_lock(lock);
+ }
+
if ( fresh )
/* nothing */;
else if ( !lo_update )
@@ -325,13 +337,6 @@ static int update_intremap_entry_from_io
spin_unlock_irqrestore(lock, flags);
- if ( iommu->enabled && !fresh )
- {
- spin_lock_irqsave(&iommu->lock, flags);
- amd_iommu_flush_intremap(iommu, req_id);
- spin_unlock_irqrestore(&iommu->lock, flags);
- }
-
set_rte_index(rte, offset);
return 0;
@@ -587,19 +592,27 @@ static int update_intremap_entry_from_ms
req_id = get_dma_requestor_id(iommu->seg, bdf);
alias_id = get_intremap_requestor_id(iommu->seg, bdf);
+ lock = get_intremap_lock(iommu->seg, req_id);
+ spin_lock_irqsave(lock, flags);
+
if ( msg == NULL )
{
- lock = get_intremap_lock(iommu->seg, req_id);
- spin_lock_irqsave(lock, flags);
for ( i = 0; i < nr; ++i )
free_intremap_entry(iommu, req_id, *remap_index + i);
spin_unlock_irqrestore(lock, flags);
- goto done;
- }
- lock = get_intremap_lock(iommu->seg, req_id);
+ if ( iommu->enabled )
+ {
+ spin_lock_irqsave(&iommu->lock, flags);
+ amd_iommu_flush_intremap(iommu, req_id);
+ if ( alias_id != req_id )
+ amd_iommu_flush_intremap(iommu, alias_id);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ }
+
+ return 0;
+ }
- spin_lock_irqsave(lock, flags);
dest_mode = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1;
delivery_mode = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1;
vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & MSI_DATA_VECTOR_MASK;
@@ -623,6 +636,22 @@ static int update_intremap_entry_from_ms
}
entry = get_intremap_entry(iommu, req_id, offset);
+
+ /* The RemapEn fields match for all formats. */
+ while ( iommu->enabled && entry.ptr32->flds.remap_en )
+ {
+ entry.ptr32->flds.remap_en = false;
+ spin_unlock(lock);
+
+ spin_lock(&iommu->lock);
+ amd_iommu_flush_intremap(iommu, req_id);
+ if ( alias_id != req_id )
+ amd_iommu_flush_intremap(iommu, alias_id);
+ spin_unlock(&iommu->lock);
+
+ spin_lock(lock);
+ }
+
update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest);
spin_unlock_irqrestore(lock, flags);
@@ -642,16 +671,6 @@ static int update_intremap_entry_from_ms
get_ivrs_mappings(iommu->seg)[alias_id].intremap_table);
}
-done:
- if ( iommu->enabled )
- {
- spin_lock_irqsave(&iommu->lock, flags);
- amd_iommu_flush_intremap(iommu, req_id);
- if ( alias_id != req_id )
- amd_iommu_flush_intremap(iommu, alias_id);
- spin_unlock_irqrestore(&iommu->lock, flags);
- }
-
return 0;
}
++++++ 5d417b6a-AMD-IOMMU-dont-needlessly-log-headers.patch ++++++
References: bsc#1135799
# Commit b28ae8b23cbc40202e6de3017f11bdacac9d9590
# Date 2019-07-31 13:28:42 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: don't needlessly log headers when dumping IRTs
Log SBDF headers only when there are actual IRTEs to log. This is
particularly important for the total volume of output when the ACPI
tables describe far more than just the existing devices. On my Rome
system so far there was one line for every function of every device on
all 256 buses of segment 0, with extremely few exceptions (like the
IOMMUs themselves).
Also only log one of the "per-device" or "shared" overall headers.
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -883,7 +883,8 @@ int __init amd_setup_hpet_msi(struct msi
}
static void dump_intremap_table(const struct amd_iommu *iommu,
- union irte_cptr tbl)
+ union irte_cptr tbl,
+ const struct ivrs_mappings *ivrs_mapping)
{
unsigned int count;
@@ -892,19 +893,25 @@ static void dump_intremap_table(const st
for ( count = 0; count < INTREMAP_ENTRIES; count++ )
{
- if ( iommu->ctrl.ga_en )
- {
- if ( !tbl.ptr128[count].raw[0] && !tbl.ptr128[count].raw[1] )
+ if ( iommu->ctrl.ga_en
+ ? !tbl.ptr128[count].raw[0] && !tbl.ptr128[count].raw[1]
+ : !tbl.ptr32[count].raw )
continue;
+
+ if ( ivrs_mapping )
+ {
+ printk(" %04x:%02x:%02x:%u:\n", iommu->seg,
+ PCI_BUS(ivrs_mapping->dte_requestor_id),
+ PCI_SLOT(ivrs_mapping->dte_requestor_id),
+ PCI_FUNC(ivrs_mapping->dte_requestor_id));
+ ivrs_mapping = NULL;
+ }
+
+ if ( iommu->ctrl.ga_en )
printk(" IRTE[%03x] %016lx_%016lx\n",
count, tbl.ptr128[count].raw[1], tbl.ptr128[count].raw[0]);
- }
else
- {
- if ( !tbl.ptr32[count].raw )
- continue;
printk(" IRTE[%03x] %08x\n", count, tbl.ptr32[count].raw);
- }
}
}
@@ -916,13 +923,8 @@ static int dump_intremap_mapping(const s
if ( !ivrs_mapping )
return 0;
- printk(" %04x:%02x:%02x:%u:\n", iommu->seg,
- PCI_BUS(ivrs_mapping->dte_requestor_id),
- PCI_SLOT(ivrs_mapping->dte_requestor_id),
- PCI_FUNC(ivrs_mapping->dte_requestor_id));
-
spin_lock_irqsave(&(ivrs_mapping->intremap_lock), flags);
- dump_intremap_table(iommu, ivrs_mapping->intremap_table);
+ dump_intremap_table(iommu, ivrs_mapping->intremap_table, ivrs_mapping);
spin_unlock_irqrestore(&(ivrs_mapping->intremap_lock), flags);
process_pending_softirqs();
@@ -932,17 +934,22 @@ static int dump_intremap_mapping(const s
static void dump_intremap_tables(unsigned char key)
{
- unsigned long flags;
-
- printk("--- Dumping Per-dev IOMMU Interrupt Remapping Table ---\n");
+ if ( !shared_intremap_table )
+ {
+ printk("--- Dumping Per-dev IOMMU Interrupt Remapping Table ---\n");
- iterate_ivrs_entries(dump_intremap_mapping);
+ iterate_ivrs_entries(dump_intremap_mapping);
+ }
+ else
+ {
+ unsigned long flags;
- printk("--- Dumping Shared IOMMU Interrupt Remapping Table ---\n");
+ printk("--- Dumping Shared IOMMU Interrupt Remapping Table ---\n");
- spin_lock_irqsave(&shared_intremap_lock, flags);
- dump_intremap_table(list_first_entry(&amd_iommu_head, struct amd_iommu,
- list),
- shared_intremap_table);
- spin_unlock_irqrestore(&shared_intremap_lock, flags);
+ spin_lock_irqsave(&shared_intremap_lock, flags);
+ dump_intremap_table(list_first_entry(&amd_iommu_head, struct amd_iommu,
+ list),
+ shared_intremap_table, NULL);
+ spin_unlock_irqrestore(&shared_intremap_lock, flags);
+ }
}
++++++ 5d4a9d25-AMD-IOMMU-drop-not-found-message.patch ++++++
References: bsc#1135799
# Commit ef04aa69f06d38b8852d6cca02d13aaf3deaa74c
# Date 2019-08-07 10:43:01 +0100
# Author Andrew Cooper
# Committer Andrew Cooper
passthrough/amd: Drop "IOMMU not found" message
Since c/s 9fa94e10585 "x86/ACPI: also parse AMD IOMMU tables early", this
function is unconditionally called in all cases where a DMAR ACPI table
doesn't exist.
As a consequence, "AMD-Vi: IOMMU not found!" is printed in all cases where an
IOMMU isn't present, even on non-AMD systems. Drop the message - it isn't
terribly interesting anyway, and is now misleading in a number of common
cases.
Signed-off-by: Andrew Cooper
Acked-by: Jan Beulich
Acked-by: Brian Woods
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -178,7 +178,6 @@ int __init acpi_ivrs_init(void)
if ( (amd_iommu_detect_acpi() !=0) || (iommu_found() == 0) )
{
- printk("AMD-Vi: IOMMU not found!\n");
iommu_intremap = 0;
return -ENODEV;
}
++++++ 5d67ceaf-x86-properly-gate-PKU-clearing.patch ++++++
# Commit 41c7700a00011ad08be3c9d71126b67e08e58ac3
# Date 2019-08-29 15:10:07 +0200
# Author Jan Beulich
# Committer Jan Beulich
x86: properly gate clearing of PKU feature
setup_clear_cpu_cap() is __init and hence may not be called post-boot.
Note that opt_pku nevertheless is not getting __initdata added - see
e.g. commit 43fa95ae6a ("mm: make opt_bootscrub non-init").
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -464,7 +464,7 @@ void identify_cpu(struct cpuinfo_x86 *c)
this_cpu->c_init(c);
- if ( !opt_pku )
+ if (c == &boot_cpu_data && !opt_pku)
setup_clear_cpu_cap(X86_FEATURE_PKU);
/*
++++++ 5d70bfba-x86-shadow-dont-enable-with-too-small-allocation.patch ++++++
References: bsc#1145240
# Commit 8b25551baa3307af0aa1ef8f7f43403f01c2c5d7
# Date 2019-09-05 09:56:42 +0200
# Author Jan Beulich
# Committer Jan Beulich
x86/shadow: don't enable shadow mode with too small a shadow allocation (part 2)
Commit 2634b997af ("x86/shadow: don't enable shadow mode with too small
a shadow allocation") was incomplete: The adjustment done there to
shadow_enable() is also needed in shadow_one_bit_enable(). The (new)
problem report was (apparently) a failed PV guest migration followed by
another migration attempt for that same guest. Disabling log-dirty mode
after the first one had left a couple of shadow pages allocated (perhaps
something that also wants fixing), and hence the second enabling of
log-dirty mode wouldn't have allocated anything further.
Reported-by: James Wang
Signed-off-by: Jan Beulich
Acked-by: Tim Deegan
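In other words, "no pages allocated" is the wrong emptiness test; the pool
has to be compared against the minimum the guest actually needs. A hedged
sketch with invented names standing in for the real accessors:

    extern unsigned int sh_min_alloc;  /* hypothetical: sh_min_allocation(d) */
    extern unsigned int p2m_pages;     /* hypothetical: shadow.p2m_pages     */
    extern int grow_shadow_pool(void); /* hypothetical: shadow_set_allocation */

    static int maybe_init_pool(unsigned int total_pages)
    {
        /* A prior log-dirty run may have left a couple of pages in the
         * pool, so "total_pages == 0" wrongly concludes it was already
         * sized for this guest. Compare against the real minimum. */
        if ( total_pages < sh_min_alloc + p2m_pages )
            return grow_shadow_pool();

        return 0;
    }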
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -2984,7 +2984,8 @@ static int shadow_one_bit_enable(struct
mode |= PG_SH_enable;
- if ( d->arch.paging.shadow.total_pages == 0 )
+ if ( d->arch.paging.shadow.total_pages <
+ sh_min_allocation(d) + d->arch.paging.shadow.p2m_pages )
{
/* Init the shadow memory allocation if the user hasn't done so */
if ( shadow_set_allocation(d, 1, NULL) != 0 )
++++++ 5d779811-x86-fix-CPUID7-0-eax-levelling-MSR.patch ++++++
# Commit b50d78d0eaffb43d5f5ceeda55fa22c11f47d01b
# Date 2019-09-10 13:33:21 +0100
# Author Andrew Cooper
# Committer Andrew Cooper
x86/cpuid: Fix handling of the CPUID.7[0].eax levelling MSR
7a0 is an integer field, not a mask - taking the bitwise AND of the hardware
and policy values results in nonsense. Instead, take the policy value
directly.
Signed-off-by: Andrew Cooper
Reviewed-by: Roger Pau Monné
Reviewed-by: Jan Beulich
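A two-line demonstration of why AND-ing an integer field is nonsense (values
invented): with a policy max_subleaf of 1 and hardware reporting 2, the
intersection that is correct for feature bitmaps yields neither value.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t policy_max_subleaf = 1;   /* what the policy says      */
        uint32_t hw_max_subleaf = 2;       /* what the hardware reports */

        /* Correct for feature bitmaps, garbage for integer fields: */
        printf("%u\n", policy_max_subleaf & hw_max_subleaf); /* prints 0 */
        return 0;
    }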
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -217,11 +217,15 @@ static int update_domain_cpuid_info(stru
if ( is_pv_domain(d) && ((levelling_caps & LCAP_7ab0) == LCAP_7ab0) )
{
uint64_t mask = cpuidmask_defaults._7ab0;
- uint32_t eax = ctl->eax;
- uint32_t ebx = p->feat._7b0;
+ /*
+ * Leaf 7[0].eax is max_subleaf, not a feature mask. Take it
+ * wholesale from the policy, but clamp the features in 7[0].ebx
+ * per usual.
+ */
if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
- mask &= ((uint64_t)eax << 32) | ebx;
+ mask = (((uint64_t)p->feat.max_subleaf << 32) |
+ ((uint32_t)mask & p->feat._7b0));
d->arch.pv.cpuidmasks->_7ab0 = mask;
}
++++++ 5d77b40f-fix-hvm_all_ioreq_servers_add_vcpu-cleanup.patch ++++++
# Commit 215f2576b0ac1bc18f3ff74e34f0d8379bda9040
# Date 2019-09-10 16:32:47 +0200
# Author Roger Pau Monné
# Committer Jan Beulich
ioreq: fix hvm_all_ioreq_servers_add_vcpu fail path cleanup
The loop in FOR_EACH_IOREQ_SERVER is backwards, hence the cleanup on
failure needs to be done forwards.
Fixes: 97a5a3e30161 ('x86/hvm/ioreq: maintain an array of ioreq servers rather than a list')
Signed-off-by: Roger Pau Monné
Reviewed-by: Paul Durrant
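The underlying idiom, as a hedged standalone sketch (MAX_NR and the helpers
are invented): when the setup loop counts down, the entries already
initialised at the point of failure are those above the failing index, so
cleanup must walk upwards.

    #define MAX_NR 8                       /* stand-in for MAX_NR_IOREQ_SERVERS */

    extern int setup_one(unsigned int id); /* hypothetical helpers */
    extern void undo_one(unsigned int id);

    static int setup_all(void)
    {
        unsigned int id;

        /* FOR_EACH_IOREQ_SERVER iterates from MAX_NR - 1 down to 0... */
        for ( id = MAX_NR; id-- != 0; )
            if ( setup_one(id) )
                goto fail;

        return 0;

     fail:
        /* ...so the old "while ( id-- != 0 )" walked the untouched lower
         * half; the initialised entries lie above the failing id. */
        while ( ++id != MAX_NR )
            undo_one(id);

        return -1;
    }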
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -1202,7 +1202,7 @@ int hvm_all_ioreq_servers_add_vcpu(struc
return 0;
fail:
- while ( id-- != 0 )
+ while ( ++id != MAX_NR_IOREQ_SERVERS )
{
s = GET_IOREQ_SERVER(d, id);
++++++ 5d80e7c0-AMD-IOMMU-free-shared-IRT-once.patch ++++++
References: bsc#1135799
# Commit 2ec0c0e13efffa0cb5ad2d98381dacb638c7e786
# Date 2019-09-17 16:03:44 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: don't free shared IRT multiple times
Calling amd_iommu_free_intremap_table() for every IVRS entry is correct
only in per-device-IRT mode. Use a NULL 2nd argument to indicate that
the shared table should be freed, and call the function exactly once in
shared mode.
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1109,6 +1109,15 @@ static void __init amd_iommu_init_cleanu
{
struct amd_iommu *iommu, *next;
+ /* free interrupt remapping table */
+ if ( amd_iommu_perdev_intremap )
+ iterate_ivrs_entries(amd_iommu_free_intremap_table);
+ else if ( shared_intremap_table )
+ amd_iommu_free_intremap_table(list_first_entry(&amd_iommu_head,
+ struct amd_iommu,
+ list),
+ NULL);
+
/* free amd iommu list */
list_for_each_entry_safe ( iommu, next, &amd_iommu_head, list )
{
@@ -1131,9 +1140,6 @@ static void __init amd_iommu_init_cleanu
xfree(iommu);
}
- /* free interrupt remapping table */
- iterate_ivrs_entries(amd_iommu_free_intremap_table);
-
/* free device table */
deallocate_device_table(&device_table);
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -793,14 +793,23 @@ void amd_iommu_read_msi_from_ire(
int __init amd_iommu_free_intremap_table(
const struct amd_iommu *iommu, struct ivrs_mappings *ivrs_mapping)
{
- void *tb = ivrs_mapping->intremap_table;
+ void **tblp;
- XFREE(ivrs_mapping->intremap_inuse);
+ if ( ivrs_mapping )
+ {
+ XFREE(ivrs_mapping->intremap_inuse);
+ tblp = &ivrs_mapping->intremap_table;
+ }
+ else
+ {
+ XFREE(shared_intremap_inuse);
+ tblp = &shared_intremap_table;
+ }
- if ( tb )
+ if ( *tblp )
{
- __free_amd_iommu_tables(tb, intremap_table_order(iommu));
- ivrs_mapping->intremap_table = NULL;
+ __free_amd_iommu_tables(*tblp, intremap_table_order(iommu));
+ *tblp = NULL;
}
return 0;
++++++ 5d80e80d-AMD-IOMMU-valid-flag-for-IVRS-mappings.patch ++++++
References: bsc#1135799
# Commit 34c0dcf84ff6347424808d2740398c892b8ff8e4
# Date 2019-09-17 16:05:01 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: introduce a "valid" flag for IVRS mappings
For us to no longer blindly allocate interrupt remapping tables for
everything the ACPI tables name, we can't have struct ivrs_mappings'
intremap_table field double as a "this entry is valid" indication. Add a
separate boolean field instead.
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -88,6 +88,8 @@ static void __init add_ivrs_mapping_entr
}
}
+ ivrs_mappings[alias_id].valid = true;
+
/* Assign IOMMU hardware. */
ivrs_mappings[bdf].iommu = iommu;
}
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1247,7 +1247,6 @@ static int __init amd_iommu_setup_device
u16 seg, struct ivrs_mappings *ivrs_mappings)
{
unsigned int bdf;
- void *intr_tb, *dte;
BUG_ON( (ivrs_bdf_entries == 0) );
@@ -1267,16 +1266,17 @@ static int __init amd_iommu_setup_device
/* Add device table entries */
for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ )
{
- intr_tb = ivrs_mappings[bdf].intremap_table;
-
- if ( intr_tb )
+ if ( ivrs_mappings[bdf].valid )
{
+ void *dte;
+
/* add device table entry */
dte = device_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE);
iommu_dte_add_device_entry(dte, &ivrs_mappings[bdf]);
amd_iommu_set_intremap_table(
- dte, (u64)virt_to_maddr(intr_tb), iommu_intremap);
+ dte, virt_to_maddr(ivrs_mappings[bdf].intremap_table),
+ iommu_intremap);
}
}
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -69,8 +69,8 @@ struct amd_iommu *find_iommu_for_device(
* table and I/O page table respectively. Such devices will have
* both alias entry and select entry in IVRS structure.
*
- * Return original device id, if device has valid interrupt remapping
- * table setup for both select entry and alias entry.
+ * Return original device id if both the specific entry and the alias entry
+ * have been marked valid.
*/
int get_dma_requestor_id(u16 seg, u16 bdf)
{
@@ -79,8 +79,7 @@ int get_dma_requestor_id(u16 seg, u16 bd
BUG_ON ( bdf >= ivrs_bdf_entries );
req_id = ivrs_mappings[bdf].dte_requestor_id;
- if ( (ivrs_mappings[bdf].intremap_table != NULL) &&
- (ivrs_mappings[req_id].intremap_table != NULL) )
+ if ( ivrs_mappings[bdf].valid && ivrs_mappings[req_id].valid )
req_id = bdf;
return req_id;
--- a/xen/include/asm-x86/amd-iommu.h
+++ b/xen/include/asm-x86/amd-iommu.h
@@ -116,6 +116,7 @@ struct ivrs_mappings {
u8 unity_map_enable;
u8 write_permission;
u8 read_permission;
+ bool valid;
unsigned long addr_range_start;
unsigned long addr_range_length;
struct amd_iommu *iommu;
++++++ 5d80e82e-AMD-IOMMU-alloc_intremap_table-callers-handle-errors.patch ++++++
References: bsc#1135799
# Commit 076c34d12a341f3915a2d8021752e0641df07496
# Date 2019-09-17 16:05:34 +0200
# Author Jan Beulich
# Committer Jan Beulich
AMD/IOMMU: let callers of amd_iommu_alloc_intremap_table() handle errors
Additional users of the function will want to handle errors more
gracefully. Remove the BUG_ON()s and make the current caller panic()
instead.
Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -86,6 +86,10 @@ static void __init add_ivrs_mapping_entr
ivrs_mappings[alias_id].intremap_table = shared_intremap_table;
ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse;
}
+
+ if ( !ivrs_mappings[alias_id].intremap_table )
+ panic("No memory for %04x:%02x:%02x.%u's IRT\n", iommu->seg,
+ PCI_BUS(alias_id), PCI_SLOT(alias_id), PCI_FUNC(alias_id));
}
ivrs_mappings[alias_id].valid = true;
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -818,12 +818,22 @@ int __init amd_iommu_free_intremap_table
void *__init amd_iommu_alloc_intremap_table(
const struct amd_iommu *iommu, unsigned long **inuse_map)
{
- void *tb = __alloc_amd_iommu_tables(intremap_table_order(iommu));
+ unsigned int order = intremap_table_order(iommu);
+ void *tb = __alloc_amd_iommu_tables(order);
+
+ if ( tb )
+ {
+ *inuse_map = xzalloc_array(unsigned long,
+ BITS_TO_LONGS(INTREMAP_ENTRIES));
+ if ( *inuse_map )
+ memset(tb, 0, PAGE_SIZE << order);
+ else
+ {
+ __free_amd_iommu_tables(tb, order);
+ tb = NULL;
+ }
+ }
- BUG_ON(tb == NULL);
- memset(tb, 0, PAGE_SIZE << intremap_table_order(iommu));
- *inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(INTREMAP_ENTRIES));
- BUG_ON(*inuse_map == NULL);
return tb;
}
++++++ 5d80e857-x86-PCI-read-MSI-X-table-entry-count-early.patch ++++++
References: bsc#1135799
# Commit 27ddc58d42a7848dbe60ba9f127ddd052906d487
# Date 2019-09-17 16:06:15 +0200
# Author Jan Beulich
# Committer Jan Beulich
x86/PCI: read MSI-X table entry count early
Rather than doing this every time we set up interrupts for a device
anew (and then in two distinct places) fill this invariant field
right after allocating struct arch_msix.
While at it also obtain the MSI-X capability structure position just
once, in msix_capability_init(), rather than in each caller.
Furthermore take the opportunity and eliminate the multi_msix_capable()
alias of msix_table_size().
Signed-off-by: Jan Beulich
Reviewed-by: Roger Pau Monné
Acked-by: Andrew Cooper
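The entry count is an invariant of the device: the QSIZE field of the MSI-X
control word encodes it as N - 1. A hedged sketch of the one-time read this
moves to device allocation time (the config-space accessor is a stand-in):

    #include <stdint.h>

    #define PCI_MSIX_FLAGS_QSIZE 0x07ff      /* table size, encoded as N - 1 */

    extern uint16_t read_msix_control(void); /* hypothetical config read */

    static unsigned int msix_nr_entries(void)
    {
        uint16_t ctrl = read_msix_control();

        /* msix_table_size(): add the 1 back. Caching the result in
         * struct arch_msix avoids re-reading config space on every
         * interrupt setup. */
        return (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
    }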
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -844,10 +844,8 @@ static u64 read_pci_mem_bar(u16 seg, u8
* requested MSI-X entries with allocated irqs or non-zero for otherwise.
**/
static int msix_capability_init(struct pci_dev *dev,
- unsigned int pos,
struct msi_info *msi,
- struct msi_desc **desc,
- unsigned int nr_entries)
+ struct msi_desc **desc)
{
struct arch_msix *msix = dev->msix;
struct msi_desc *entry = NULL;
@@ -861,6 +859,11 @@ static int msix_capability_init(struct p
u8 slot = PCI_SLOT(dev->devfn);
u8 func = PCI_FUNC(dev->devfn);
bool maskall = msix->host_maskall;
+ unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
+ PCI_CAP_ID_MSIX);
+
+ if ( !pos )
+ return -ENODEV;
ASSERT(pcidevs_locked());
@@ -936,10 +939,9 @@ static int msix_capability_init(struct p
u64 pba_paddr;
u32 pba_offset;
- msix->nr_entries = nr_entries;
msix->table.first = PFN_DOWN(table_paddr);
msix->table.last = PFN_DOWN(table_paddr +
- nr_entries * PCI_MSIX_ENTRY_SIZE - 1);
+ msix->nr_entries * PCI_MSIX_ENTRY_SIZE - 1);
WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->table.first,
msix->table.last));
@@ -952,7 +954,7 @@ static int msix_capability_init(struct p
msix->pba.first = PFN_DOWN(pba_paddr);
msix->pba.last = PFN_DOWN(pba_paddr +
- BITS_TO_LONGS(nr_entries) - 1);
+ BITS_TO_LONGS(msix->nr_entries) - 1);
WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->pba.first,
msix->pba.last));
}
@@ -1024,7 +1026,6 @@ static int msix_capability_init(struct p
/* XXX How to deal with existing mappings? */
}
}
- WARN_ON(msix->nr_entries != nr_entries);
WARN_ON(msix->table.first != (table_paddr >> PAGE_SHIFT));
++msix->used_entries;
@@ -1118,23 +1119,17 @@ static void __pci_disable_msi(struct msi
**/
static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
{
- int pos, nr_entries;
struct pci_dev *pdev;
- u16 control;
u8 slot = PCI_SLOT(msi->devfn);
u8 func = PCI_FUNC(msi->devfn);
struct msi_desc *old_desc;
ASSERT(pcidevs_locked());
pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
- pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX);
- if ( !pdev || !pos )
+ if ( !pdev || !pdev->msix )
return -ENODEV;
- control = pci_conf_read16(msi->seg, msi->bus, slot, func,
- msix_control_reg(pos));
- nr_entries = multi_msix_capable(control);
- if ( msi->entry_nr >= nr_entries )
+ if ( msi->entry_nr >= pdev->msix->nr_entries )
return -EINVAL;
old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSIX);
@@ -1153,7 +1148,7 @@ static int __pci_enable_msix(struct msi_
__pci_disable_msi(old_desc);
}
- return msix_capability_init(pdev, pos, msi, desc, nr_entries);
+ return msix_capability_init(pdev, msi, desc);
}
static void _pci_cleanup_msix(struct arch_msix *msix)
@@ -1213,16 +1208,10 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
{
int rc;
struct pci_dev *pdev;
- u8 slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
- unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
- PCI_CAP_ID_MSIX);
if ( !use_msi )
return 0;
- if ( !pos )
- return -ENODEV;
-
pcidevs_lock();
pdev = pci_get_pdev(seg, bus, devfn);
if ( !pdev )
@@ -1235,13 +1224,7 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
rc = 0;
}
else
- {
- u16 control = pci_conf_read16(seg, bus, slot, func,
- msix_control_reg(pos));
-
- rc = msix_capability_init(pdev, pos, NULL, NULL,
- multi_msix_capable(control));
- }
+ rc = msix_capability_init(pdev, NULL, NULL);
pcidevs_unlock();
return rc;
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -330,6 +330,7 @@ static void apply_quirks(struct pci_dev
static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
{
struct pci_dev *pdev;
+ unsigned int pos;
list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
if ( pdev->bus == bus && pdev->devfn == devfn )
@@ -345,10 +346,12 @@ static struct pci_dev *alloc_pdev(struct
pdev->domain = NULL;
INIT_LIST_HEAD(&pdev->msi_list);
- if ( pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
- PCI_CAP_ID_MSIX) )
+ pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+ PCI_CAP_ID_MSIX);
+ if ( pos )
{
struct arch_msix *msix = xzalloc(struct arch_msix);
+ uint16_t ctrl;
if ( !msix )
{
@@ -356,6 +359,11 @@ static struct pci_dev *alloc_pdev(struct
return NULL;
}
spin_lock_init(&msix->table_lock);
+
+ ctrl = pci_conf_read16(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+ msix_control_reg(pos));
+ msix->nr_entries = msix_table_size(ctrl);
+
pdev->msix = msix;
}
@@ -364,7 +372,6 @@ static struct pci_dev *alloc_pdev(struct
/* update bus2bridge */
switch ( pdev->type = pdev_type(pseg->nr, bus, devfn) )
{
- int pos;
u16 cap;
u8 sec_bus, sub_bus;
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -171,7 +171,6 @@ int msi_free_irq(struct msi_desc *entry)
#define msix_enable(control) control |= PCI_MSIX_FLAGS_ENABLE
#define msix_disable(control) control &= ~PCI_MSIX_FLAGS_ENABLE
#define msix_table_size(control) ((control & PCI_MSIX_FLAGS_QSIZE)+1)
-#define multi_msix_capable msix_table_size
#define msix_unmask(address) (address & ~PCI_MSIX_VECTOR_BITMASK)
#define msix_mask(address) (address | PCI_MSIX_VECTOR_BITMASK)
++++++ 5d80ea13-vpci-honor-read-only-devices.patch ++++++
# Commit 79f9ba78380fb3f4bf509e5c726c6cdd76e00c4f
# Date 2019-09-17 16:13:39 +0200
# Author Roger Pau Monné
# Committer Jan Beulich
vpci: honor read-only devices
Don't allow the hardware domain write access to the PCI config space of
devices marked as read-only.
Signed-off-by: Roger Pau Monné
Reviewed-by: Jan Beulich
--- a/tools/tests/vpci/emul.h
+++ b/tools/tests/vpci/emul.h
@@ -92,6 +92,9 @@ typedef union {
#define xfree(p) free(p)
#define pci_get_pdev_by_domain(...) &test_pdev
+#define pci_get_ro_map(...) NULL
+
+#define test_bit(...) false
/* Dummy native helpers. Writes are ignored, reads return 1's. */
#define pci_conf_read8(...) 0xff
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -421,6 +421,7 @@ void vpci_write(pci_sbdf_t sbdf, unsigne
const struct pci_dev *pdev;
const struct vpci_register *r;
unsigned int data_offset = 0;
+ const unsigned long *ro_map = pci_get_ro_map(sbdf.seg);
if ( !size )
{
@@ -428,6 +429,10 @@ void vpci_write(pci_sbdf_t sbdf, unsigne
return;
}
+ if ( ro_map && test_bit(sbdf.bdf, ro_map) )
+ /* Ignore writes to read-only devices. */
+ return;
+
/*
* Find the PCI dev matching the address.
* Passthrough everything that's not trapped.
++++++ 5d89d8d9-libxc-x86-avoid-overflow-in-CPUID-APIC-ID.patch ++++++
References: bsc#1137717
# Commit df29d03f1d97bdde1bc0cea8ef8538d4f524b3ec
# Date 2019-09-24 10:50:33 +0200
# Author Jan Beulich
# Committer Jan Beulich
libxc/x86: avoid certain overflows in CPUID APIC ID adjustments
Recent AMD processors may report up to 128 logical processors in CPUID
leaf 1. Doubling this value produces 0 (which OSes sincerely dislike),
as the respective field is only 8 bits wide. Suppress doubling the value
(and its leaf 0x80000008 counterpart) in such a case.
Note that while there's a similar overflow in intel_xc_cpuid_policy(),
that one is being left alone for now.
Note further that while it was considered to suppress the multiplication
by 2 altogether if the host topology already provides at least one bit
of thread ID within APIC IDs, it was decided to avoid more change here
than really needed at this point.
Also zap leaf 4 (and at the same time leaf 2) EDX output for AMD, as it
should have been from the beginning.
Signed-off-by: Jan Beulich
Reviewed-by: Andrew Cooper
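The overflow is plain 8-bit truncation: the logical processor count lives in
an 8-bit CPUID field, so doubling 128 wraps to 0. A worked sketch (values
invented for illustration):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint8_t count = 128;            /* as a recent AMD CPU reports  */
        uint8_t doubled = count * 2;    /* truncated to 8 bits          */

        printf("%u\n", doubled);        /* prints 0 - OSes dislike that */

        /* Hence the fix: double only when the result still fits. */
        if ( count <= 0x7f )
            count *= 2;

        printf("%u\n", count);          /* still 128 here */
        return 0;
    }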
# Commit c9c7ac508b3f65f7d5f9685893096a1b22d8b176
# Date 2019-09-25 15:50:58 +0200
# Author Jan Beulich
# Committer Jan Beulich
libxc/x86: correct overflow avoidance check in AMD CPUID handling
Commit df29d03f1d ("libxc/x86: avoid certain overflows in CPUID APIC ID
adjustments") introduced a one bit too narrow mask when checking whether
multiplying by 2 (in particular in leaf 1) would result in overflow.
Reported-by: Andrew Cooper