Hello community, here is the log from the commit of package xen for openSUSE:Factory checked in at 2020-11-02 09:38:21 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/xen (Old) and /work/SRC/openSUSE:Factory/.xen.new.3463 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Package is "xen" Mon Nov 2 09:38:21 2020 rev:292 rq:844630 version:4.14.0_10 Changes: -------- --- /work/SRC/openSUSE:Factory/xen/xen.changes 2020-09-25 16:21:30.839358028 +0200 +++ /work/SRC/openSUSE:Factory/.xen.new.3463/xen.changes 2020-11-02 09:38:43.337513714 +0100 @@ -1,0 +2,81 @@ +Wed Oct 21 09:34:32 MDT 2020 - carnold@suse.com + +- Upstream bug fixes (bsc#1027519) + 5f479d9e-x86-begin-to-support-MSR_ARCH_CAPS.patch + 5f4cf06e-x86-Dom0-expose-MSR_ARCH_CAPS.patch + 5f4cf96a-x86-PV-fix-SEGBASE_GS_USER_SEL.patch + 5f560c42-x86-PV-rewrite-segment-ctxt-switch.patch + 5f5b6b7a-hypfs-fix-custom-param-writes.patch + 5f607915-x86-HVM-more-consistent-IO-completion.patch + 5f6cfb5b-x86-PV-dont-GP-for-SYSENTER-with-NT-set.patch + 5f6cfb5b-x86-PV-dont-clobber-NT-on-return-to-guest.patch + 5f71a21e-x86-S3-fix-shadow-stack-resume.patch + 5f76ca65-evtchn-Flask-prealloc-for-send.patch + 5f76caaf-evtchn-FIFO-use-stable-fields.patch + 5f897c25-x86-traps-fix-read_registers-for-DF.patch + 5f897c7b-x86-smpboot-restrict-memguard_guard_stack.patch +- Renamed patches + 5f560c42-x86-PV-64bit-segbase-consistency.patch + Replaces 5f5b6951-x86-PV-64bit-segbase-consistency.patch + 5f6a002d-x86-PV-handle-MSR_MISC_ENABLE-correctly.patch + Replaces 5f6a05a0-pv-Handle-the-Intel-specific-MSR_MISC_ENABLE-correctly.patch + 5f6a0049-memory-dont-skip-RCU-unlock-in-acquire_resource.patch + Replaces 5f6a05b7-xen-memory-Dont-skip-the-RCU-unlock-path-in-acquire_resource.patch + 5f6a0067-x86-vPT-fix-race-when-migrating-timers.patch + Replaces 5f6a05dd-vpt-fix-race-when-migrating-timers-between-vCPUs.patch + 5f6a008e-x86-MSI-drop-read_msi_msg.patch + Replaces 5f6a05fa-msi-get-rid-of-read_msi_msg.patch + 5f6a00aa-x86-MSI-X-restrict-reading-of-PBA-bases.patch + Replaces 5f6a061a-MSI-X-restrict-reading-of-table-PBA-bases-from-BARs.patch + 5f6a00c4-evtchn-relax-port_is_valid.patch + Replaces 5f6a062c-evtchn-relax-port_is_valid.patch + 5f6a00df-x86-PV-avoid-double-exception-injection.patch + Replaces 5f6a065c-pv-Avoid-double-exception-injection.patch + 5f6a00f4-evtchn-add-missing-barriers.patch + Replaces 5f6a0674-xen-evtchn-Add-missing-barriers-when-accessing-allocating-an-event-channel.patch + 5f6a0111-evtchn-x86-enforce-correct-upper-limit.patch + Replaces 5f6a068e-evtchn-x86-enforce-correct-upper-limit-for-32-bit-guests.patch + 5f6a013f-evtchn_reset-shouldnt-succeed-with.patch + Replaces 5f6a06be-evtchn-evtchn_reset-shouldnt-succeed-with-still-open-ports.patch + 5f6a0160-evtchn-IRQ-safe-per-channel-lock.patch + Replaces 5f6a06e0-evtchn-convert-per-channel-lock-to-be-IRQ-safe.patch + 5f6a0178-evtchn-address-races-with-evtchn_reset.patch + Replaces 5f6a06f2-evtchn-address-races-with-evtchn_reset.patch + 5f6a01a4-evtchn-preempt-in-evtchn_destroy.patch + Replaces 5f6a071f-evtchn-arrange-for-preemption-in-evtchn_destroy.patch + 5f6a01c6-evtchn-preempt-in-evtchn_reset.patch + Replaces 5f6a0754-evtchn-arrange-for-preemption-in-evtchn_reset.patch + +------------------------------------------------------------------- +Tue Oct 13 10:48:04 MDT 2020 - carnold@suse.com + +- bsc#1177409 - VUL-0: xen: x86 PV guest INVLPG-like flushes may + leave stale TLB entries (XSA-286) + 
xsa286-1.patch + xsa286-2.patch + xsa286-3.patch + xsa286-4.patch + xsa286-5.patch + xsa286-6.patch +- bsc#1177412 - VUL-0: xen: Race condition in Xen mapping code + (XSA-345) + 5f8ed5d3-x86-mm-map_pages_to_xen-single-exit-path.patch + 5f8ed5eb-x86-mm-modify_xen_mappings-one-exit-path.patch + 5f8ed603-x86-mm-prevent-races-in-mapping-updates.patch +- bsc#1177413 - VUL-0: xen: undue deferral of IOMMU TLB flushes + (XSA-346) + 5f8ed635-IOMMU-suppress-iommu_dont_flush_iotlb-when.patch + 5f8ed64c-IOMMU-hold-page-ref-until-TLB-flush.patch +- bsc#1177414 - VUL-0: xen: unsafe AMD IOMMU page table updates + (XSA-347) + 5f8ed682-AMD-IOMMU-convert-amd_iommu_pte.patch + 5f8ed69c-AMD-IOMMU-update-live-PTEs-atomically.patch + 5f8ed6b0-AMD-IOMMU-suitably-order-DTE-mods.patch + +------------------------------------------------------------------- +Mon Oct 12 10:10:10 UTC 2020 - ohering@suse.de + +- Update libxc.sr.superpage.patch + set errno in x86_hvm_alloc_4k (bsc#1177112) + +------------------------------------------------------------------- Old: ---- 5f5b6951-x86-PV-64bit-segbase-consistency.patch 5f6a05a0-pv-Handle-the-Intel-specific-MSR_MISC_ENABLE-correctly.patch 5f6a05b7-xen-memory-Dont-skip-the-RCU-unlock-path-in-acquire_resource.patch 5f6a05dd-vpt-fix-race-when-migrating-timers-between-vCPUs.patch 5f6a05fa-msi-get-rid-of-read_msi_msg.patch 5f6a061a-MSI-X-restrict-reading-of-table-PBA-bases-from-BARs.patch 5f6a062c-evtchn-relax-port_is_valid.patch 5f6a065c-pv-Avoid-double-exception-injection.patch 5f6a0674-xen-evtchn-Add-missing-barriers-when-accessing-allocating-an-event-channel.patch 5f6a068e-evtchn-x86-enforce-correct-upper-limit-for-32-bit-guests.patch 5f6a06be-evtchn-evtchn_reset-shouldnt-succeed-with-still-open-ports.patch 5f6a06e0-evtchn-convert-per-channel-lock-to-be-IRQ-safe.patch 5f6a06f2-evtchn-address-races-with-evtchn_reset.patch 5f6a071f-evtchn-arrange-for-preemption-in-evtchn_destroy.patch 5f6a0754-evtchn-arrange-for-preemption-in-evtchn_reset.patch New: ---- 5f479d9e-x86-begin-to-support-MSR_ARCH_CAPS.patch 5f4cf06e-x86-Dom0-expose-MSR_ARCH_CAPS.patch 5f4cf96a-x86-PV-fix-SEGBASE_GS_USER_SEL.patch 5f560c42-x86-PV-64bit-segbase-consistency.patch 5f560c42-x86-PV-rewrite-segment-ctxt-switch.patch 5f5b6b7a-hypfs-fix-custom-param-writes.patch 5f607915-x86-HVM-more-consistent-IO-completion.patch 5f6a002d-x86-PV-handle-MSR_MISC_ENABLE-correctly.patch 5f6a0049-memory-dont-skip-RCU-unlock-in-acquire_resource.patch 5f6a0067-x86-vPT-fix-race-when-migrating-timers.patch 5f6a008e-x86-MSI-drop-read_msi_msg.patch 5f6a00aa-x86-MSI-X-restrict-reading-of-PBA-bases.patch 5f6a00c4-evtchn-relax-port_is_valid.patch 5f6a00df-x86-PV-avoid-double-exception-injection.patch 5f6a00f4-evtchn-add-missing-barriers.patch 5f6a0111-evtchn-x86-enforce-correct-upper-limit.patch 5f6a013f-evtchn_reset-shouldnt-succeed-with.patch 5f6a0160-evtchn-IRQ-safe-per-channel-lock.patch 5f6a0178-evtchn-address-races-with-evtchn_reset.patch 5f6a01a4-evtchn-preempt-in-evtchn_destroy.patch 5f6a01c6-evtchn-preempt-in-evtchn_reset.patch 5f6cfb5b-x86-PV-dont-GP-for-SYSENTER-with-NT-set.patch 5f6cfb5b-x86-PV-dont-clobber-NT-on-return-to-guest.patch 5f71a21e-x86-S3-fix-shadow-stack-resume.patch 5f76ca65-evtchn-Flask-prealloc-for-send.patch 5f76caaf-evtchn-FIFO-use-stable-fields.patch 5f897c25-x86-traps-fix-read_registers-for-DF.patch 5f897c7b-x86-smpboot-restrict-memguard_guard_stack.patch 5f8ed5d3-x86-mm-map_pages_to_xen-single-exit-path.patch 5f8ed5eb-x86-mm-modify_xen_mappings-one-exit-path.patch 
5f8ed603-x86-mm-prevent-races-in-mapping-updates.patch 5f8ed635-IOMMU-suppress-iommu_dont_flush_iotlb-when.patch 5f8ed64c-IOMMU-hold-page-ref-until-TLB-flush.patch 5f8ed682-AMD-IOMMU-convert-amd_iommu_pte.patch 5f8ed69c-AMD-IOMMU-update-live-PTEs-atomically.patch 5f8ed6b0-AMD-IOMMU-suitably-order-DTE-mods.patch xsa286-1.patch xsa286-2.patch xsa286-3.patch xsa286-4.patch xsa286-5.patch xsa286-6.patch ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ xen.spec ++++++ --- /var/tmp/diff_new_pack.AogIHC/_old 2020-11-02 09:38:45.029515337 +0100 +++ /var/tmp/diff_new_pack.AogIHC/_new 2020-11-02 09:38:45.033515341 +0100 @@ -125,7 +125,7 @@ BuildRequires: pesign-obs-integration %endif -Version: 4.14.0_08 +Version: 4.14.0_10 Release: 0 Summary: Xen Virtualization: Hypervisor (aka VMM aka Microkernel) License: GPL-2.0-only @@ -165,21 +165,48 @@ # Upstream patches Patch1: 5f1a9916-x86-S3-put-data-sregs-into-known-state.patch Patch2: 5f21b9fd-x86-cpuid-APIC-bit-clearing.patch -Patch3: 5f5b6951-x86-PV-64bit-segbase-consistency.patch -Patch4: 5f6a05a0-pv-Handle-the-Intel-specific-MSR_MISC_ENABLE-correctly.patch -Patch5: 5f6a05b7-xen-memory-Dont-skip-the-RCU-unlock-path-in-acquire_resource.patch -Patch6: 5f6a05dd-vpt-fix-race-when-migrating-timers-between-vCPUs.patch -Patch7: 5f6a05fa-msi-get-rid-of-read_msi_msg.patch -Patch8: 5f6a061a-MSI-X-restrict-reading-of-table-PBA-bases-from-BARs.patch -Patch9: 5f6a062c-evtchn-relax-port_is_valid.patch -Patch10: 5f6a065c-pv-Avoid-double-exception-injection.patch -Patch11: 5f6a0674-xen-evtchn-Add-missing-barriers-when-accessing-allocating-an-event-channel.patch -Patch12: 5f6a068e-evtchn-x86-enforce-correct-upper-limit-for-32-bit-guests.patch -Patch13: 5f6a06be-evtchn-evtchn_reset-shouldnt-succeed-with-still-open-ports.patch -Patch14: 5f6a06e0-evtchn-convert-per-channel-lock-to-be-IRQ-safe.patch -Patch15: 5f6a06f2-evtchn-address-races-with-evtchn_reset.patch -Patch16: 5f6a071f-evtchn-arrange-for-preemption-in-evtchn_destroy.patch -Patch17: 5f6a0754-evtchn-arrange-for-preemption-in-evtchn_reset.patch +Patch3: 5f479d9e-x86-begin-to-support-MSR_ARCH_CAPS.patch +Patch4: 5f4cf06e-x86-Dom0-expose-MSR_ARCH_CAPS.patch +Patch5: 5f4cf96a-x86-PV-fix-SEGBASE_GS_USER_SEL.patch +Patch6: 5f560c42-x86-PV-64bit-segbase-consistency.patch +Patch7: 5f560c42-x86-PV-rewrite-segment-ctxt-switch.patch +Patch8: 5f5b6b7a-hypfs-fix-custom-param-writes.patch +Patch9: 5f607915-x86-HVM-more-consistent-IO-completion.patch +Patch10: 5f6a002d-x86-PV-handle-MSR_MISC_ENABLE-correctly.patch +Patch11: 5f6a0049-memory-dont-skip-RCU-unlock-in-acquire_resource.patch +Patch12: 5f6a0067-x86-vPT-fix-race-when-migrating-timers.patch +Patch13: 5f6a008e-x86-MSI-drop-read_msi_msg.patch +Patch14: 5f6a00aa-x86-MSI-X-restrict-reading-of-PBA-bases.patch +Patch15: 5f6a00c4-evtchn-relax-port_is_valid.patch +Patch16: 5f6a00df-x86-PV-avoid-double-exception-injection.patch +Patch17: 5f6a00f4-evtchn-add-missing-barriers.patch +Patch18: 5f6a0111-evtchn-x86-enforce-correct-upper-limit.patch +Patch19: 5f6a013f-evtchn_reset-shouldnt-succeed-with.patch +Patch20: 5f6a0160-evtchn-IRQ-safe-per-channel-lock.patch +Patch21: 5f6a0178-evtchn-address-races-with-evtchn_reset.patch +Patch22: 5f6a01a4-evtchn-preempt-in-evtchn_destroy.patch +Patch23: 5f6a01c6-evtchn-preempt-in-evtchn_reset.patch +Patch24: 5f6cfb5b-x86-PV-dont-clobber-NT-on-return-to-guest.patch +Patch25: 5f6cfb5b-x86-PV-dont-GP-for-SYSENTER-with-NT-set.patch +Patch26: 
5f71a21e-x86-S3-fix-shadow-stack-resume.patch +Patch27: 5f76ca65-evtchn-Flask-prealloc-for-send.patch +Patch28: 5f76caaf-evtchn-FIFO-use-stable-fields.patch +Patch29: 5f897c25-x86-traps-fix-read_registers-for-DF.patch +Patch30: 5f897c7b-x86-smpboot-restrict-memguard_guard_stack.patch +Patch31: 5f8ed5d3-x86-mm-map_pages_to_xen-single-exit-path.patch +Patch32: 5f8ed5eb-x86-mm-modify_xen_mappings-one-exit-path.patch +Patch33: 5f8ed603-x86-mm-prevent-races-in-mapping-updates.patch +Patch34: 5f8ed635-IOMMU-suppress-iommu_dont_flush_iotlb-when.patch +Patch35: 5f8ed64c-IOMMU-hold-page-ref-until-TLB-flush.patch +Patch36: 5f8ed682-AMD-IOMMU-convert-amd_iommu_pte.patch +Patch37: 5f8ed69c-AMD-IOMMU-update-live-PTEs-atomically.patch +Patch38: 5f8ed6b0-AMD-IOMMU-suitably-order-DTE-mods.patch +Patch28601: xsa286-1.patch +Patch28602: xsa286-2.patch +Patch28603: xsa286-3.patch +Patch28604: xsa286-4.patch +Patch28605: xsa286-5.patch +Patch28606: xsa286-6.patch # Our platform specific patches Patch400: xen-destdir.patch Patch401: vif-bridge-no-iptables.patch @@ -258,8 +285,8 @@ This package contains the libraries used to interact with the Xen virtual machine monitor. -In addition to this package you need to install kernel-xen, xen and -xen-tools to use Xen. +In addition to this package you need to install xen and xen-tools +to use Xen. Authors: @@ -304,8 +331,8 @@ This package contains the control tools that allow you to start, stop, migrate, and manage virtual machines. -In addition to this package you need to install kernel-xen, xen and -xen-libs to use Xen. +In addition to this package you need to install xen and xen-libs +to use Xen. Authors: @@ -424,6 +451,33 @@ %patch15 -p1 %patch16 -p1 %patch17 -p1 +%patch18 -p1 +%patch19 -p1 +%patch20 -p1 +%patch21 -p1 +%patch22 -p1 +%patch23 -p1 +%patch24 -p1 +%patch25 -p1 +%patch26 -p1 +%patch27 -p1 +%patch28 -p1 +%patch29 -p1 +%patch30 -p1 +%patch31 -p1 +%patch32 -p1 +%patch33 -p1 +%patch34 -p1 +%patch35 -p1 +%patch36 -p1 +%patch37 -p1 +%patch38 -p1 +%patch28601 -p1 +%patch28602 -p1 +%patch28603 -p1 +%patch28604 -p1 +%patch28605 -p1 +%patch28606 -p1 # Our platform specific patches %patch400 -p1 %patch401 -p1 ++++++ 5f479d9e-x86-begin-to-support-MSR_ARCH_CAPS.patch ++++++ # Commit e32605b07ef2e01c9d05da9b2d5d7b8f9a5c7c1b # Date 2020-08-27 12:48:46 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86: Begin to introduce support for MSR_ARCH_CAPS ... including serialisation/deserialisation logic and unit tests. There is no current way to configure this MSR correctly for guests. The toolstack side this logic needs building, which is far easier to do with it in place. Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/tools/tests/cpu-policy/test-cpu-policy.c +++ b/tools/tests/cpu-policy/test-cpu-policy.c @@ -374,6 +374,11 @@ static void test_msr_deserialise_failure .msr = { .idx = 0xce, .val = ~0ull }, .rc = -EOVERFLOW, }, + { + .name = "truncated val", + .msr = { .idx = 0x10a, .val = ~0ull }, + .rc = -EOVERFLOW, + }, }; printf("Testing MSR deserialise failure:\n"); --- a/xen/arch/x86/msr.c +++ b/xen/arch/x86/msr.c @@ -220,8 +220,10 @@ int guest_rdmsr(struct vcpu *v, uint32_t break; case MSR_ARCH_CAPABILITIES: - /* Not implemented yet. 
*/ - goto gp_fault; + if ( !cp->feat.arch_caps ) + goto gp_fault; + *val = mp->arch_caps.raw; + break; case MSR_INTEL_MISC_FEATURES_ENABLES: *val = msrs->misc_features_enables.raw; --- a/xen/include/public/arch-x86/cpufeatureset.h +++ b/xen/include/public/arch-x86/cpufeatureset.h @@ -267,7 +267,7 @@ XEN_CPUFEATURE(CET_IBT, 9*32+20) / XEN_CPUFEATURE(IBRSB, 9*32+26) /*A IBRS and IBPB support (used by Intel) */ XEN_CPUFEATURE(STIBP, 9*32+27) /*A STIBP */ XEN_CPUFEATURE(L1D_FLUSH, 9*32+28) /*S MSR_FLUSH_CMD and L1D flush. */ -XEN_CPUFEATURE(ARCH_CAPS, 9*32+29) /* IA32_ARCH_CAPABILITIES MSR */ +XEN_CPUFEATURE(ARCH_CAPS, 9*32+29) /*a IA32_ARCH_CAPABILITIES MSR */ XEN_CPUFEATURE(CORE_CAPS, 9*32+30) /* IA32_CORE_CAPABILITIES MSR */ XEN_CPUFEATURE(SSBD, 9*32+31) /*A MSR_SPEC_CTRL.SSBD available */ --- a/xen/include/xen/lib/x86/msr.h +++ b/xen/include/xen/lib/x86/msr.h @@ -3,7 +3,7 @@ #define XEN_LIB_X86_MSR_H /* Maximum number of MSRs written when serialising msr_policy. */ -#define MSR_MAX_SERIALISED_ENTRIES 1 +#define MSR_MAX_SERIALISED_ENTRIES 2 /* MSR policy object for shared per-domain MSRs */ struct msr_policy @@ -23,6 +23,28 @@ struct msr_policy bool cpuid_faulting:1; }; } platform_info; + + /* + * 0x0000010a - MSR_ARCH_CAPABILITIES + * + * This is an Intel-only MSR, which provides miscellaneous enumeration, + * including those which indicate that microarchitectrual sidechannels are + * fixed in hardware. + */ + union { + uint32_t raw; + struct { + bool rdcl_no:1; + bool ibrs_all:1; + bool rsba:1; + bool skip_l1dfl:1; + bool ssb_no:1; + bool mds_no:1; + bool if_pschange_mc_no:1; + bool tsx_ctrl:1; + bool taa_no:1; + }; + } arch_caps; }; #ifdef __XEN__ --- a/xen/lib/x86/msr.c +++ b/xen/lib/x86/msr.c @@ -39,6 +39,7 @@ int x86_msr_copy_to_buffer(const struct }) COPY_MSR(MSR_INTEL_PLATFORM_INFO, p->platform_info.raw); + COPY_MSR(MSR_ARCH_CAPABILITIES, p->arch_caps.raw); #undef COPY_MSR @@ -99,6 +100,7 @@ int x86_msr_copy_from_buffer(struct msr_ }) case MSR_INTEL_PLATFORM_INFO: ASSIGN(platform_info.raw); break; + case MSR_ARCH_CAPABILITIES: ASSIGN(arch_caps.raw); break; #undef ASSIGN ++++++ 5f4cf06e-x86-Dom0-expose-MSR_ARCH_CAPS.patch ++++++ # Commit e46474278a0e87e2b32ad5dd5fc20e8d2cb0688b # Date 2020-08-31 13:43:26 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/intel: Expose MSR_ARCH_CAPS to dom0 The overhead of (the lack of) MDS_NO alone has been measured at 30% on some workloads. While we're not in a position yet to offer MSR_ARCH_CAPS generally to guests, dom0 doesn't migrate, so we can pass a subset of hardware values straight through. This will cause PVH dom0's not to use KPTI by default, and all dom0's not to use VERW flushing by default, and to use eIBRS in preference to retpoline on recent Intel CPUs. Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/cpuid.c +++ b/xen/arch/x86/cpuid.c @@ -719,6 +719,14 @@ int init_domain_cpuid_policy(struct doma if ( d->disable_migrate ) p->extd.itsc = cpu_has_itsc; + /* + * Expose the "hardware speculation behaviour" bits of ARCH_CAPS to dom0, + * so dom0 can turn off workarounds as appropriate. Temporary, until the + * domain policy logic gains a better understanding of MSRs. 
+ */ + if ( is_hardware_domain(d) && boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) + p->feat.arch_caps = true; + d->arch.cpuid = p; recalculate_cpuid_policy(d); --- a/xen/arch/x86/msr.c +++ b/xen/arch/x86/msr.c @@ -130,6 +130,22 @@ int init_domain_msr_policy(struct domain if ( !opt_dom0_cpuid_faulting && is_control_domain(d) && is_pv_domain(d) ) mp->platform_info.cpuid_faulting = false; + /* + * Expose the "hardware speculation behaviour" bits of ARCH_CAPS to dom0, + * so dom0 can turn off workarounds as appropriate. Temporary, until the + * domain policy logic gains a better understanding of MSRs. + */ + if ( is_hardware_domain(d) && boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) + { + uint64_t val; + + rdmsrl(MSR_ARCH_CAPABILITIES, val); + + mp->arch_caps.raw = val & + (ARCH_CAPS_RDCL_NO | ARCH_CAPS_IBRS_ALL | ARCH_CAPS_RSBA | + ARCH_CAPS_SSB_NO | ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO); + } + d->arch.msr = mp; return 0; ++++++ 5f4cf96a-x86-PV-fix-SEGBASE_GS_USER_SEL.patch ++++++ # Commit afe018e041ec112d90a8b4e6ed607d22aa06f280 # Date 2020-08-31 14:21:46 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/pv: Fix multiple bugs with SEGBASE_GS_USER_SEL The logic takes the segment selector unmodified from guest context. This allowed the guest to load DPL0 descriptors into %gs. Fix up the RPL for non-NUL selectors to be 3. Xen's context switch logic skips saving the inactive %gs base, as it cannot be modified by the guest behind Xen's back. This depends on Xen caching updates to the inactive base, which is was missing from this path. The consequence is that, following SEGBASE_GS_USER_SEL, the next context switch will restore the stale inactive %gs base, and corrupt vcpu state. Rework the hypercall to update the cached idea of gs_base_user, and fix the behaviour in the case of the AMD NUL selector bug to always zero the segment base. Reported-by: Andy Lutomirski <luto@kernel.org> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -1031,17 +1031,54 @@ long do_set_segment_base(unsigned int wh break; case SEGBASE_GS_USER_SEL: - __asm__ __volatile__ ( - " swapgs \n" - "1: movl %k0,%%gs \n" - " "safe_swapgs" \n" - ".section .fixup,\"ax\" \n" - "2: xorl %k0,%k0 \n" - " jmp 1b \n" - ".previous \n" - _ASM_EXTABLE(1b, 2b) - : "+r" (base) ); + { + unsigned int sel = (uint16_t)base; + + /* + * We wish to update the user %gs from the GDT/LDT. Currently, the + * guest kernel's GS_BASE is in context. + */ + asm volatile ( "swapgs" ); + + if ( sel > 3 ) + /* Fix up RPL for non-NUL selectors. */ + sel |= 3; + else if ( boot_cpu_data.x86_vendor & + (X86_VENDOR_AMD | X86_VENDOR_HYGON) ) + /* Work around NUL segment behaviour on AMD hardware. */ + asm volatile ( "mov %[sel], %%gs" + :: [sel] "r" (FLAT_USER_DS32) ); + + /* + * Load the chosen selector, with fault handling. + * + * Errors ought to fail the hypercall, but that was never built in + * originally, and Linux will BUG() if this call fails. + * + * NUL the selector in the case of an error. This too needs to deal + * with the AMD NUL segment behaviour, but it is already a slowpath in + * #GP context so perform the flat load unconditionally to avoid + * complicated logic. + * + * Anyone wanting to check for errors from this hypercall should + * re-read %gs and compare against the input. 
+ */ + asm volatile ( "1: mov %[sel], %%gs\n\t" + ".section .fixup, \"ax\", @progbits\n\t" + "2: mov %k[flat], %%gs\n\t" + " xor %[sel], %[sel]\n\t" + " jmp 1b\n\t" + ".previous\n\t" + _ASM_EXTABLE(1b, 2b) + : [sel] "+r" (sel) + : [flat] "r" (FLAT_USER_DS32) ); + + /* Update the cache of the inactive base, as read from the GDT/LDT. */ + v->arch.pv.gs_base_user = rdgsbase(); + + asm volatile ( safe_swapgs ); break; + } default: ret = -EINVAL; ++++++ 5f560c42-x86-PV-64bit-segbase-consistency.patch ++++++ # Commit a5eaac9245f4f382a3cd0e9710e9d1cba7db20e4 # Date 2020-09-07 11:32:34 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/pv: Fix consistency of 64bit segment bases The comments in save_segments(), _toggle_guest_pt() and write_cr() are false. The %fs and %gs bases can be updated at any time by the guest. As a consequence, Xen's fs_base/etc tracking state is always stale when the vcpu is in context, and must not be used to complete MSR_{FS,GS}_BASE reads, etc. In particular, a sequence such as: wrmsr(MSR_FS_BASE, 0x1ull << 32); write_fs(__USER_DS); base = rdmsr(MSR_FS_BASE); will return the stale base, not the new base. This may cause guest a guest kernel's context switching of userspace to malfunction. Therefore: * Update save_segments(), _toggle_guest_pt() and read_msr() to always read the segment bases from hardware. * Update write_cr(), write_msr() and do_set_segment_base() to not not waste time caching data which is instantly going to become stale again. * Provide comments explaining when the tracking state is and isn't stale. This bug has been present for 14 years, but several bugfixes since have built on and extended the original flawed logic. Fixes: ba9adb737ba ("Apply stricter checking to RDMSR/WRMSR emulations.") Fixes: c42494acb2f ("x86: fix FS/GS base handling when using the fsgsbase feature") Fixed: eccc170053e ("x86/pv: Don't have %cr4.fsgsbase active behind a guest kernels back") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -1562,6 +1562,16 @@ static void load_segments(struct vcpu *n } } +/* + * Record all guest segment state. The guest can load segment selectors + * without trapping, which will also alter the 64bit FS/GS bases. Arbitrary + * changes to bases can also be made with the WR{FS,GS}BASE instructions, when + * enabled. + * + * Guests however cannot use SWAPGS, so there is no mechanism to modify the + * inactive GS base behind Xen's back. Therefore, Xen's copy of the inactive + * GS base is still accurate, and doesn't need reading back from hardware. + */ static void save_segments(struct vcpu *v) { struct cpu_user_regs *regs = &v->arch.user_regs; @@ -1572,14 +1582,15 @@ static void save_segments(struct vcpu *v regs->fs = read_sreg(fs); regs->gs = read_sreg(gs); - /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. 
*/ - if ( (read_cr4() & X86_CR4_FSGSBASE) && !is_pv_32bit_vcpu(v) ) + if ( !is_pv_32bit_vcpu(v) ) { - v->arch.pv.fs_base = __rdfsbase(); + unsigned long gs_base = rdgsbase(); + + v->arch.pv.fs_base = rdfsbase(); if ( v->arch.flags & TF_kernel_mode ) - v->arch.pv.gs_base_kernel = __rdgsbase(); + v->arch.pv.gs_base_kernel = gs_base; else - v->arch.pv.gs_base_user = __rdgsbase(); + v->arch.pv.gs_base_user = gs_base; } if ( regs->ds ) --- a/xen/arch/x86/pv/domain.c +++ b/xen/arch/x86/pv/domain.c @@ -444,17 +444,19 @@ static void _toggle_guest_pt(struct vcpu void toggle_guest_mode(struct vcpu *v) { const struct domain *d = v->domain; + unsigned long gs_base; ASSERT(!is_pv_32bit_vcpu(v)); - /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. */ - if ( read_cr4() & X86_CR4_FSGSBASE ) - { - if ( v->arch.flags & TF_kernel_mode ) - v->arch.pv.gs_base_kernel = __rdgsbase(); - else - v->arch.pv.gs_base_user = __rdgsbase(); - } + /* + * Update the cached value of the GS base about to become inactive, as a + * subsequent context switch won't bother re-reading it. + */ + gs_base = rdgsbase(); + if ( v->arch.flags & TF_kernel_mode ) + v->arch.pv.gs_base_kernel = gs_base; + else + v->arch.pv.gs_base_user = gs_base; asm volatile ( "swapgs" ); _toggle_guest_pt(v); --- a/xen/arch/x86/pv/emul-priv-op.c +++ b/xen/arch/x86/pv/emul-priv-op.c @@ -801,17 +801,6 @@ static int write_cr(unsigned int reg, un } case 4: /* Write CR4 */ - /* - * If this write will disable FSGSBASE, refresh Xen's idea of the - * guest bases now that they can no longer change. - */ - if ( (curr->arch.pv.ctrlreg[4] & X86_CR4_FSGSBASE) && - !(val & X86_CR4_FSGSBASE) ) - { - curr->arch.pv.fs_base = __rdfsbase(); - curr->arch.pv.gs_base_kernel = __rdgsbase(); - } - curr->arch.pv.ctrlreg[4] = pv_fixup_guest_cr4(curr, val); write_cr4(pv_make_cr4(curr)); ctxt_switch_levelling(curr); @@ -860,15 +849,13 @@ static int read_msr(unsigned int reg, ui case MSR_FS_BASE: if ( is_pv_32bit_domain(currd) ) break; - *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdfsbase() - : curr->arch.pv.fs_base; + *val = rdfsbase(); return X86EMUL_OKAY; case MSR_GS_BASE: if ( is_pv_32bit_domain(currd) ) break; - *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdgsbase() - : curr->arch.pv.gs_base_kernel; + *val = rdgsbase(); return X86EMUL_OKAY; case MSR_SHADOW_GS_BASE: @@ -997,14 +984,12 @@ static int write_msr(unsigned int reg, u if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) ) break; wrfsbase(val); - curr->arch.pv.fs_base = val; return X86EMUL_OKAY; case MSR_GS_BASE: if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) ) break; wrgsbase(val); - curr->arch.pv.gs_base_kernel = val; return X86EMUL_OKAY; case MSR_SHADOW_GS_BASE: --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -1002,10 +1002,7 @@ long do_set_segment_base(unsigned int wh { case SEGBASE_FS: if ( is_canonical_address(base) ) - { wrfsbase(base); - v->arch.pv.fs_base = base; - } else ret = -EINVAL; break; @@ -1022,10 +1019,7 @@ long do_set_segment_base(unsigned int wh case SEGBASE_GS_KERNEL: if ( is_canonical_address(base) ) - { wrgsbase(base); - v->arch.pv.gs_base_kernel = base; - } else ret = -EINVAL; break; --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -516,7 +516,24 @@ struct pv_vcpu bool_t syscall32_disables_events; bool_t sysenter_disables_events; - /* Segment base addresses. */ + /* + * 64bit segment bases. 
+ * + * FS and the active GS are always stale when the vCPU is in context, as + * the guest can change them behind Xen's back with MOV SREG, or + * WR{FS,GS}BASE on capable hardware. + * + * The inactive GS base is never stale, as guests can't use SWAPGS to + * access it - all modification is performed by Xen either directly + * (hypercall, #GP emulation), or indirectly (toggle_guest_mode()). + * + * The vCPU context switch path is optimised based on this fact, so any + * path updating or swapping the inactive base must update the cached + * value as well. + * + * Which GS base is active and inactive depends on whether the vCPU is in + * user or kernel context. + */ unsigned long fs_base; unsigned long gs_base_kernel; unsigned long gs_base_user; ++++++ 5f560c42-x86-PV-rewrite-segment-ctxt-switch.patch ++++++ # Commit ad0fd291c5e79191c2e3c70e43dded569f11a450 # Date 2020-09-07 11:32:34 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/pv: Rewrite segment context switching from scratch There are multiple bugs with the existing implementation. On AMD CPUs prior to Zen2, loading a NUL segment selector doesn't clear the segment base, which is a problem for 64bit code which typically expects to use a NUL %fs/%gs selector. On a context switch from any PV vcpu, to a 64bit PV vcpu with an %fs/%gs selector which faults, the fixup logic loads NUL, and the guest is entered at the failsafe callback with the stale base. Alternatively, a PV context switch sequence of 64 (NUL, non-zero base) => 32 (NUL) => 64 (NUL, zero base) will similarly cause Xen to enter the guest with a stale base. Both of these corner cases manifest as state corruption in the final vcpu. However, damage is limited to to 64bit code expecting to use Thread Local Storage with a base pointer of 0, which doesn't occur by default. The context switch logic is extremely complicated, and is attempting to optimise away loading a NUL selector (which is fast), or writing a 64bit base of 0 (which is rare). Furthermore, it fails to respect Linux's ABI with userspace, which manifests as userspace state corruption as far as Linux is concerned. Always restore all selector and base state, in all cases. Leave a large comment explaining hardware behaviour, and the new ABI expectations. Update the comments in the public headers. Drop all "segment preloading" to handle the AMD corner case. It was never anything but a waste of time for %ds/%es, and isn't needed now that %fs/%gs bases are unconditionally written for 64bit PV guests. In load_segments(), store the result of is_pv_32bit_vcpu() as it is an expensive predicate now, and not used in a way which impacts speculative safety. Reported-by: Andy Lutomirski <luto@kernel.org> Reported-by: Sarah Newman <srn@prgmr.com> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> # Commit 1e2d3be2e516e6f415ca6029f651b76a8563a27c # Date 2020-09-08 16:46:31 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/pv: Fix assertions in svm_load_segs() OSSTest has shown an assertion failure: http://logs.test-lab.xenproject.org/osstest/logs/153906/test-xtf-amd64-amd64... This is because we pass a non-NUL selector into svm_load_segs(), which is something we must not do, as this path does not load the attributes/limits from the GDT/LDT. Drop the {fs,gs}_sel parameters from svm_load_segs() and use 0 instead. 
This is acceptable even for non-zero NUL segments, as it is how the IRET instruction behaves in all CPUs. Only use the svm_load_segs() path when both FS and GS are NUL, which is the common case when scheduling a 64bit vcpu with 64bit userspace in context. Fixes: ad0fd291c5 ("x86/pv: Rewrite segment context switching from scratch") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -1017,13 +1017,9 @@ int arch_set_info_guest( if ( !compat ) { v->arch.pv.syscall_callback_eip = c.nat->syscall_callback_eip; - /* non-nul selector kills fs_base */ - v->arch.pv.fs_base = - !(v->arch.user_regs.fs & ~3) ? c.nat->fs_base : 0; + v->arch.pv.fs_base = c.nat->fs_base; v->arch.pv.gs_base_kernel = c.nat->gs_base_kernel; - /* non-nul selector kills gs_base_user */ - v->arch.pv.gs_base_user = - !(v->arch.user_regs.gs & ~3) ? c.nat->gs_base_user : 0; + v->arch.pv.gs_base_user = c.nat->gs_base_user; } else { @@ -1342,58 +1338,60 @@ arch_do_vcpu_op( } /* - * Loading a nul selector does not clear bases and limits on AMD or Hygon - * CPUs. Be on the safe side and re-initialize both to flat segment values - * before loading a nul selector. - */ -#define preload_segment(seg, value) do { \ - if ( !((value) & ~3) && \ - (boot_cpu_data.x86_vendor & \ - (X86_VENDOR_AMD | X86_VENDOR_HYGON)) ) \ - asm volatile ( "movl %k0, %%" #seg \ - :: "r" (FLAT_USER_DS32) ); \ -} while ( false ) - -#define loadsegment(seg,value) ({ \ - int __r = 1; \ - asm volatile ( \ - "1: movl %k1,%%" #seg "\n2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: xorl %k0,%k0\n" \ - " movl %k0,%%" #seg "\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : "=r" (__r) : "r" (value), "0" (__r) );\ - __r; }) - -/* - * save_segments() writes a mask of segments which are dirty (non-zero), - * allowing load_segments() to avoid some expensive segment loads and - * MSR writes. + * Notes on PV segment handling: + * - 32bit: All data from the GDT/LDT. + * - 64bit: In addition, 64bit FS/GS/GS_KERN bases. + * + * Linux's ABI with userspace expects to preserve the full selector and + * segment base, even sel != NUL, base != GDT/LDT for 64bit code. Xen must + * honour this when context switching, to avoid breaking Linux's ABI. + * + * Note: It is impossible to preserve a selector value of 1, 2 or 3, as these + * get reset to 0 by an IRET back to guest context. Code playing with + * arcane corners of x86 get to keep all resulting pieces. + * + * Therefore, we: + * - Load the LDT. + * - Load each segment selector. + * - Any error loads zero, and triggers a failsafe callback. + * - For 64bit, further load the 64bit bases. + * + * An optimisation exists on SVM-capable hardware, where we use a VMLOAD + * instruction to load the LDT and full FS/GS/GS_KERN data in one go. + * + * AMD-like CPUs prior to Zen2 do not zero the segment base or limit when + * loading a NUL selector. This is a problem in principle when context + * switching to a 64bit guest, as a NUL FS/GS segment is usable and will pick + * up the stale base. + * + * However, it is not an issue in practice. NUL segments are unusable for + * 32bit guests (so any stale base won't be used), and we unconditionally + * write the full FS/GS bases for 64bit guests. 
*/ -static DEFINE_PER_CPU(unsigned int, dirty_segment_mask); -#define DIRTY_DS 0x01 -#define DIRTY_ES 0x02 -#define DIRTY_FS 0x04 -#define DIRTY_GS 0x08 -#define DIRTY_FS_BASE 0x10 -#define DIRTY_GS_BASE 0x20 - static void load_segments(struct vcpu *n) { struct cpu_user_regs *uregs = &n->arch.user_regs; - int all_segs_okay = 1; - unsigned int dirty_segment_mask, cpu = smp_processor_id(); - bool fs_gs_done = false; - - /* Load and clear the dirty segment mask. */ - dirty_segment_mask = per_cpu(dirty_segment_mask, cpu); - per_cpu(dirty_segment_mask, cpu) = 0; + bool compat = is_pv_32bit_vcpu(n); + bool all_segs_okay = true, fs_gs_done = false; + + /* + * Attempt to load @seg with selector @val. On error, clear + * @all_segs_okay in function scope, and load NUL into @sel. + */ +#define TRY_LOAD_SEG(seg, val) \ + asm volatile ( "1: mov %k[_val], %%" #seg "\n\t" \ + "2:\n\t" \ + ".section .fixup, \"ax\"\n\t" \ + "3: xor %k[ok], %k[ok]\n\t" \ + " mov %k[ok], %%" #seg "\n\t" \ + " jmp 2b\n\t" \ + ".previous\n\t" \ + _ASM_EXTABLE(1b, 3b) \ + : [ok] "+r" (all_segs_okay) \ + : [_val] "rm" (val) ) #ifdef CONFIG_HVM - if ( cpu_has_svm && !is_pv_32bit_vcpu(n) && - !(read_cr4() & X86_CR4_FSGSBASE) && !((uregs->fs | uregs->gs) & ~3) ) + if ( cpu_has_svm && !compat && (uregs->fs | uregs->gs) <= 3 ) { unsigned long gsb = n->arch.flags & TF_kernel_mode ? n->arch.pv.gs_base_kernel : n->arch.pv.gs_base_user; @@ -1401,62 +1399,25 @@ static void load_segments(struct vcpu *n ? n->arch.pv.gs_base_user : n->arch.pv.gs_base_kernel; fs_gs_done = svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n), - uregs->fs, n->arch.pv.fs_base, - uregs->gs, gsb, gss); + n->arch.pv.fs_base, gsb, gss); } #endif if ( !fs_gs_done ) - load_LDT(n); - - /* Either selector != 0 ==> reload. */ - if ( unlikely((dirty_segment_mask & DIRTY_DS) | uregs->ds) ) - { - preload_segment(ds, uregs->ds); - all_segs_okay &= loadsegment(ds, uregs->ds); - } - - /* Either selector != 0 ==> reload. */ - if ( unlikely((dirty_segment_mask & DIRTY_ES) | uregs->es) ) { - preload_segment(es, uregs->es); - all_segs_okay &= loadsegment(es, uregs->es); - } + load_LDT(n); - /* Either selector != 0 ==> reload. */ - if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) && !fs_gs_done ) - { - all_segs_okay &= loadsegment(fs, uregs->fs); - /* non-nul selector updates fs_base */ - if ( uregs->fs & ~3 ) - dirty_segment_mask &= ~DIRTY_FS_BASE; + TRY_LOAD_SEG(fs, uregs->fs); + TRY_LOAD_SEG(gs, uregs->gs); } - /* Either selector != 0 ==> reload. */ - if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) && !fs_gs_done ) - { - all_segs_okay &= loadsegment(gs, uregs->gs); - /* non-nul selector updates gs_base_user */ - if ( uregs->gs & ~3 ) - dirty_segment_mask &= ~DIRTY_GS_BASE; - } + TRY_LOAD_SEG(ds, uregs->ds); + TRY_LOAD_SEG(es, uregs->es); - if ( !fs_gs_done && !is_pv_32bit_vcpu(n) ) + if ( !fs_gs_done && !compat ) { - /* This can only be non-zero if selector is NULL. */ - if ( n->arch.pv.fs_base | (dirty_segment_mask & DIRTY_FS_BASE) ) - wrfsbase(n->arch.pv.fs_base); - - /* - * Most kernels have non-zero GS base, so don't bother testing. - * (For old AMD hardware this is also a serialising instruction, - * avoiding erratum #88.) - */ + wrfsbase(n->arch.pv.fs_base); wrgsshadow(n->arch.pv.gs_base_kernel); - - /* This can only be non-zero if selector is NULL. */ - if ( n->arch.pv.gs_base_user | - (dirty_segment_mask & DIRTY_GS_BASE) ) - wrgsbase(n->arch.pv.gs_base_user); + wrgsbase(n->arch.pv.gs_base_user); /* If in kernel mode then switch the GS bases around. 
*/ if ( (n->arch.flags & TF_kernel_mode) ) @@ -1575,7 +1536,6 @@ static void load_segments(struct vcpu *n static void save_segments(struct vcpu *v) { struct cpu_user_regs *regs = &v->arch.user_regs; - unsigned int dirty_segment_mask = 0; regs->ds = read_sreg(ds); regs->es = read_sreg(es); @@ -1592,35 +1552,6 @@ static void save_segments(struct vcpu *v else v->arch.pv.gs_base_user = gs_base; } - - if ( regs->ds ) - dirty_segment_mask |= DIRTY_DS; - - if ( regs->es ) - dirty_segment_mask |= DIRTY_ES; - - if ( regs->fs || is_pv_32bit_vcpu(v) ) - { - dirty_segment_mask |= DIRTY_FS; - /* non-nul selector kills fs_base */ - if ( regs->fs & ~3 ) - v->arch.pv.fs_base = 0; - } - if ( v->arch.pv.fs_base ) - dirty_segment_mask |= DIRTY_FS_BASE; - - if ( regs->gs || is_pv_32bit_vcpu(v) ) - { - dirty_segment_mask |= DIRTY_GS; - /* non-nul selector kills gs_base_user */ - if ( regs->gs & ~3 ) - v->arch.pv.gs_base_user = 0; - } - if ( v->arch.flags & TF_kernel_mode ? v->arch.pv.gs_base_kernel - : v->arch.pv.gs_base_user ) - dirty_segment_mask |= DIRTY_GS_BASE; - - this_cpu(dirty_segment_mask) = dirty_segment_mask; } void paravirt_ctxt_switch_from(struct vcpu *v) @@ -1830,8 +1761,8 @@ static void __context_switch(void) #if defined(CONFIG_PV) && defined(CONFIG_HVM) /* Prefetch the VMCB if we expect to use it later in the context switch */ if ( cpu_has_svm && is_pv_domain(nd) && !is_pv_32bit_domain(nd) && - !is_idle_domain(nd) && !(read_cr4() & X86_CR4_FSGSBASE) ) - svm_load_segs(0, 0, 0, 0, 0, 0, 0); + !is_idle_domain(nd) ) + svm_load_segs(0, 0, 0, 0, 0); #endif if ( need_full_gdt(nd) && !per_cpu(full_gdt_loaded, cpu) ) --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -1518,8 +1518,7 @@ static void svm_init_erratum_383(const s #ifdef CONFIG_PV bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base, - unsigned int fs_sel, unsigned long fs_base, - unsigned int gs_sel, unsigned long gs_base, + unsigned long fs_base, unsigned long gs_base, unsigned long gs_shadow) { unsigned int cpu = smp_processor_id(); @@ -1556,14 +1555,12 @@ bool svm_load_segs(unsigned int ldt_ents vmcb->ldtr.base = ldt_base; } - ASSERT(!(fs_sel & ~3)); - vmcb->fs.sel = fs_sel; + vmcb->fs.sel = 0; vmcb->fs.attr = 0; vmcb->fs.limit = 0; vmcb->fs.base = fs_base; - ASSERT(!(gs_sel & ~3)); - vmcb->gs.sel = gs_sel; + vmcb->gs.sel = 0; vmcb->gs.attr = 0; vmcb->gs.limit = 0; vmcb->gs.base = gs_base; --- a/xen/include/asm-x86/hvm/svm/svm.h +++ b/xen/include/asm-x86/hvm/svm/svm.h @@ -52,10 +52,12 @@ void svm_update_guest_cr(struct vcpu *, /* * PV context switch helper. Calls with zero ldt_base request a prefetch of * the VMCB area to be loaded from, instead of an actual load of state. + * + * Must only be used for NUL FS/GS, as the segment attributes/limits are not + * read from the GDT/LDT. */ bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base, - unsigned int fs_sel, unsigned long fs_base, - unsigned int gs_sel, unsigned long gs_base, + unsigned long fs_base, unsigned long gs_base, unsigned long gs_shadow); extern u32 svm_feature_flags; --- a/xen/include/public/arch-x86/xen-x86_64.h +++ b/xen/include/public/arch-x86/xen-x86_64.h @@ -203,8 +203,8 @@ struct cpu_user_regs { uint16_t ss, _pad2[3]; uint16_t es, _pad3[3]; uint16_t ds, _pad4[3]; - uint16_t fs, _pad5[3]; /* Non-nul => takes precedence over fs_base. */ - uint16_t gs, _pad6[3]; /* Non-nul => takes precedence over gs_base_user. 
*/ + uint16_t fs, _pad5[3]; + uint16_t gs, _pad6[3]; }; typedef struct cpu_user_regs cpu_user_regs_t; DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t); ++++++ 5f5b6b7a-hypfs-fix-custom-param-writes.patch ++++++ # Commit b4e41b1750d550bf2b1ccf97ee46f4f682bdbb62 # Date 2020-09-11 14:20:10 +0200 # Author Juergen Gross <jgross@suse.com> # Committer Jan Beulich <jbeulich@suse.com> xen/hypfs: fix writing of custom parameter Today the maximum allowed data length for writing a hypfs node is tested in the generic hypfs_write() function. For custom runtime parameters this might be wrong, as the maximum allowed size is derived from the buffer holding the current setting, while there might be ways to set the parameter needing more characters than the minimal representation of that value. One example for this is the "ept" parameter. Its value buffer is sized to be able to hold the string "exec-sp=0" or "exec-sp=1", while it is allowed to use e.g. "no-exec-sp" or "exec-sp=yes" for setting it. Fix that by moving the length check one level down to the type specific write function. In order to avoid allocation of arbitrary sized buffers use a new MAX_PARAM_SIZE macro as an upper limit for custom writes. The value of MAX_PARAM_SIZE is the same as the limit in parse_params() for a single parameter. Fixes: a659d7cab9af ("xen: add runtime parameter access support to hypfs") Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> Tested-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/common/hypfs.c +++ b/xen/common/hypfs.c @@ -297,7 +297,9 @@ int hypfs_write_leaf(struct hypfs_entry_ int ret; ASSERT(this_cpu(hypfs_locked) == hypfs_write_locked); - ASSERT(ulen <= leaf->e.max_size); + + if ( ulen > leaf->e.max_size ) + return -ENOSPC; if ( leaf->e.type != XEN_HYPFS_TYPE_STRING && leaf->e.type != XEN_HYPFS_TYPE_BLOB && ulen != leaf->e.size ) @@ -356,6 +358,10 @@ int hypfs_write_custom(struct hypfs_entr ASSERT(this_cpu(hypfs_locked) == hypfs_write_locked); + /* Avoid oversized buffer allocation. */ + if ( ulen > MAX_PARAM_SIZE ) + return -ENOSPC; + buf = xzalloc_array(char, ulen); if ( !buf ) return -ENOMEM; @@ -386,9 +392,6 @@ static int hypfs_write(struct hypfs_entr ASSERT(entry->max_size); - if ( ulen > entry->max_size ) - return -ENOSPC; - l = container_of(entry, struct hypfs_entry_leaf, e); return entry->write(l, uaddr, ulen); --- a/xen/common/kernel.c +++ b/xen/common/kernel.c @@ -57,7 +57,7 @@ static int assign_integer_param(const st static int parse_params(const char *cmdline, const struct kernel_param *start, const struct kernel_param *end) { - char opt[128], *optval, *optkey, *q; + char opt[MAX_PARAM_SIZE], *optval, *optkey, *q; const char *p = cmdline, *key; const struct kernel_param *param; int rc, final_rc = 0; --- a/xen/include/xen/param.h +++ b/xen/include/xen/param.h @@ -26,6 +26,9 @@ struct kernel_param { } par; }; +/* Maximum length of a single parameter string. */ +#define MAX_PARAM_SIZE 128 + extern const struct kernel_param __setup_start[], __setup_end[]; #define __param(att) static const att \ ++++++ 5f607915-x86-HVM-more-consistent-IO-completion.patch ++++++ # Commit b807cfe954b8d0d8852398b4c8a586d95d69a342 # Date 2020-09-15 10:19:33 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> x86/HVM: more consistently set I/O completion Doing this just in hvm_emulate_one_insn() is not enough. 
hvm_ud_intercept() and hvm_emulate_one_vm_event() can get invoked for insns requiring one or more continuations, and at least in principle hvm_emulate_one_mmio() could, too. Without proper setting of the field, handle_hvm_io_completion() will do nothing completion-wise, and in particular the missing re-invocation of the insn emulation paths will lead to emulation caching not getting disabled in due course, causing the ASSERT() in {svm,vmx}_vmenter_helper() to trigger. Reported-by: Don Slutz <don.slutz@gmail.com> Similar considerations go for the clearing of vio->mmio_access, which gets moved as well. Additionally all updating of vio->mmio_* now gets done dependent upon the new completion value, rather than hvm_ioreq_needs_completion()'s return value. This is because it is the completion chosen which controls what path will be taken when handling the completion, not the simple boolean return value. In particular, PIO completion doesn't involve going through the insn emulator, and hence emulator state ought to get cleared early (or it won't get cleared at all). The new logic, besides allowing for a caller override for the continuation type to be set (for VMX real mode emulation), will also avoid setting an MMIO completion when a simpler PIO one will do. This is a minor optimization only as a side effect - the behavior is strictly needed at least for hvm_ud_intercept(), as only memory accesses can successfully complete through handle_mmio(). Care of course needs to be taken to correctly deal with "mixed" insns (doing both MMIO and PIO at the same time, i.e. INS/OUTS). For this, hvmemul_validate() now latches whether the insn being emulated is a memory access, as this information is no longer easily available at the point where we want to consume it. Note that the presence of non-NULL .validate fields in the two ops structures in hvm_emulate_one_mmio() was really necessary even before the changes here: Without this, passing non-NULL as middle argument to hvm_emulate_init_once() is meaningless. The restrictions on when the #UD intercept gets actually enabled are why it was decided that this is not a security issue: - the "hvm_fep" option to enable its use is a debugging option only, - for the cross-vendor case is considered experimental, even if unfortunately SUPPORT.md doesn't have an explicit statement about this. The other two affected functions are - hvm_emulate_one_vm_event(), used for introspection, - hvm_emulate_one_mmio(), used for Dom0 only, which aren't qualifying this as needing an XSA either. Signed-off-by: Jan Beulich <jbeulich@suse.com> Tested-by: Don Slutz <don.slutz@gmail.com> Reviewed-by: Paul Durrant <paul@xen.org> Reviewed-by: Kevin Tian <kevin.tian@intel.com> --- a/xen/arch/x86/hvm/emulate.c +++ b/xen/arch/x86/hvm/emulate.c @@ -1683,9 +1683,11 @@ static int hvmemul_validate( const struct x86_emulate_state *state, struct x86_emulate_ctxt *ctxt) { - const struct hvm_emulate_ctxt *hvmemul_ctxt = + struct hvm_emulate_ctxt *hvmemul_ctxt = container_of(ctxt, struct hvm_emulate_ctxt, ctxt); + hvmemul_ctxt->is_mem_access = x86_insn_is_mem_access(state, ctxt); + return !hvmemul_ctxt->validate || hvmemul_ctxt->validate(state, ctxt) ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE; } @@ -2610,8 +2612,14 @@ static const struct x86_emulate_ops hvm_ .vmfunc = hvmemul_vmfunc, }; +/* + * Note that passing HVMIO_no_completion into this function serves as kind + * of (but not fully) an "auto select completion" indicator. 
When there's + * no completion needed, the passed in value will be ignored in any case. + */ static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt, - const struct x86_emulate_ops *ops) + const struct x86_emulate_ops *ops, + enum hvm_io_completion completion) { const struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs; struct vcpu *curr = current; @@ -2642,16 +2650,31 @@ static int _hvm_emulate_one(struct hvm_e rc = X86EMUL_RETRY; if ( !hvm_ioreq_needs_completion(&vio->io_req) ) + completion = HVMIO_no_completion; + else if ( completion == HVMIO_no_completion ) + completion = (vio->io_req.type != IOREQ_TYPE_PIO || + hvmemul_ctxt->is_mem_access) ? HVMIO_mmio_completion + : HVMIO_pio_completion; + + switch ( vio->io_completion = completion ) { + case HVMIO_no_completion: + case HVMIO_pio_completion: vio->mmio_cache_count = 0; vio->mmio_insn_bytes = 0; + vio->mmio_access = (struct npfec){}; hvmemul_cache_disable(curr); - } - else - { + break; + + case HVMIO_mmio_completion: + case HVMIO_realmode_completion: BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf)); vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes; memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes); + break; + + default: + ASSERT_UNREACHABLE(); } if ( hvmemul_ctxt->ctxt.retire.singlestep ) @@ -2692,9 +2715,10 @@ static int _hvm_emulate_one(struct hvm_e } int hvm_emulate_one( - struct hvm_emulate_ctxt *hvmemul_ctxt) + struct hvm_emulate_ctxt *hvmemul_ctxt, + enum hvm_io_completion completion) { - return _hvm_emulate_one(hvmemul_ctxt, &hvm_emulate_ops); + return _hvm_emulate_one(hvmemul_ctxt, &hvm_emulate_ops, completion); } int hvm_emulate_one_mmio(unsigned long mfn, unsigned long gla) @@ -2703,11 +2727,13 @@ int hvm_emulate_one_mmio(unsigned long m .read = x86emul_unhandleable_rw, .insn_fetch = hvmemul_insn_fetch, .write = mmcfg_intercept_write, + .validate = hvmemul_validate, }; static const struct x86_emulate_ops hvm_ro_emulate_ops_mmio = { .read = x86emul_unhandleable_rw, .insn_fetch = hvmemul_insn_fetch, .write = mmio_ro_emulated_write, + .validate = hvmemul_validate, }; struct mmio_ro_emulate_ctxt mmio_ro_ctxt = { .cr2 = gla }; struct hvm_emulate_ctxt ctxt; @@ -2727,8 +2753,8 @@ int hvm_emulate_one_mmio(unsigned long m hvm_emulate_init_once(&ctxt, x86_insn_is_mem_write, guest_cpu_user_regs()); ctxt.ctxt.data = &mmio_ro_ctxt; - rc = _hvm_emulate_one(&ctxt, ops); - switch ( rc ) + + switch ( rc = _hvm_emulate_one(&ctxt, ops, HVMIO_no_completion) ) { case X86EMUL_UNHANDLEABLE: case X86EMUL_UNIMPLEMENTED: @@ -2755,7 +2781,8 @@ void hvm_emulate_one_vm_event(enum emul_ switch ( kind ) { case EMUL_KIND_NOWRITE: - rc = _hvm_emulate_one(&ctx, &hvm_emulate_ops_no_write); + rc = _hvm_emulate_one(&ctx, &hvm_emulate_ops_no_write, + HVMIO_no_completion); break; case EMUL_KIND_SET_CONTEXT_INSN: { struct vcpu *curr = current; @@ -2776,7 +2803,7 @@ void hvm_emulate_one_vm_event(enum emul_ /* Fall-through */ default: ctx.set_context = (kind == EMUL_KIND_SET_CONTEXT_DATA); - rc = hvm_emulate_one(&ctx); + rc = hvm_emulate_one(&ctx, HVMIO_no_completion); } switch ( rc ) @@ -2874,6 +2901,8 @@ void hvm_emulate_init_per_insn( pfec, NULL) == HVMTRANS_okay) ? 
sizeof(hvmemul_ctxt->insn_buf) : 0; } + + hvmemul_ctxt->is_mem_access = false; } void hvm_emulate_writeback( --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -3810,7 +3810,7 @@ void hvm_ud_intercept(struct cpu_user_re return; } - switch ( hvm_emulate_one(&ctxt) ) + switch ( hvm_emulate_one(&ctxt, HVMIO_no_completion) ) { case X86EMUL_UNHANDLEABLE: case X86EMUL_UNIMPLEMENTED: --- a/xen/arch/x86/hvm/io.c +++ b/xen/arch/x86/hvm/io.c @@ -81,20 +81,11 @@ void send_invalidate_req(void) bool hvm_emulate_one_insn(hvm_emulate_validate_t *validate, const char *descr) { struct hvm_emulate_ctxt ctxt; - struct vcpu *curr = current; - struct hvm_vcpu_io *vio = &curr->arch.hvm.hvm_io; int rc; hvm_emulate_init_once(&ctxt, validate, guest_cpu_user_regs()); - rc = hvm_emulate_one(&ctxt); - - if ( hvm_ioreq_needs_completion(&vio->io_req) ) - vio->io_completion = HVMIO_mmio_completion; - else - vio->mmio_access = (struct npfec){}; - - switch ( rc ) + switch ( rc = hvm_emulate_one(&ctxt, HVMIO_no_completion) ) { case X86EMUL_UNHANDLEABLE: hvm_dump_emulation_state(XENLOG_G_WARNING, descr, &ctxt, rc); --- a/xen/arch/x86/hvm/vmx/realmode.c +++ b/xen/arch/x86/hvm/vmx/realmode.c @@ -97,15 +97,11 @@ static void realmode_deliver_exception( void vmx_realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt) { struct vcpu *curr = current; - struct hvm_vcpu_io *vio = &curr->arch.hvm.hvm_io; int rc; perfc_incr(realmode_emulations); - rc = hvm_emulate_one(hvmemul_ctxt); - - if ( hvm_ioreq_needs_completion(&vio->io_req) ) - vio->io_completion = HVMIO_realmode_completion; + rc = hvm_emulate_one(hvmemul_ctxt, HVMIO_realmode_completion); if ( rc == X86EMUL_UNHANDLEABLE ) { --- a/xen/include/asm-x86/hvm/emulate.h +++ b/xen/include/asm-x86/hvm/emulate.h @@ -48,6 +48,8 @@ struct hvm_emulate_ctxt { uint32_t intr_shadow; + bool is_mem_access; + bool_t set_context; }; @@ -62,7 +64,8 @@ bool __nonnull(1, 2) hvm_emulate_one_ins hvm_emulate_validate_t *validate, const char *descr); int hvm_emulate_one( - struct hvm_emulate_ctxt *hvmemul_ctxt); + struct hvm_emulate_ctxt *hvmemul_ctxt, + enum hvm_io_completion completion); void hvm_emulate_one_vm_event(enum emul_kind kind, unsigned int trapnr, unsigned int errcode); ++++++ 5f6a002d-x86-PV-handle-MSR_MISC_ENABLE-correctly.patch ++++++ # Commit e71301ecd50f2d3bd1b960bbf7dcf850d02e7e8a # Date 2020-09-22 15:46:21 +0200 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Jan Beulich <jbeulich@suse.com> x86/pv: Handle the Intel-specific MSR_MISC_ENABLE correctly This MSR doesn't exist on AMD hardware, and switching away from the safe functions in the common MSR path was an erroneous change. Partially revert the change. This is XSA-333. 
Fixes: 4fdc932b3cc ("x86/Intel: drop another 32-bit leftover") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Wei Liu <wl@xen.org> --- a/xen/arch/x86/pv/emul-priv-op.c +++ b/xen/arch/x86/pv/emul-priv-op.c @@ -913,7 +913,8 @@ static int read_msr(unsigned int reg, ui return X86EMUL_OKAY; case MSR_IA32_MISC_ENABLE: - rdmsrl(reg, *val); + if ( rdmsr_safe(reg, *val) ) + break; *val = guest_misc_enable(*val); return X86EMUL_OKAY; @@ -1053,7 +1054,8 @@ static int write_msr(unsigned int reg, u break; case MSR_IA32_MISC_ENABLE: - rdmsrl(reg, temp); + if ( rdmsr_safe(reg, temp) ) + break; if ( val != guest_misc_enable(temp) ) goto invalid; return X86EMUL_OKAY; ++++++ 5f6a0049-memory-dont-skip-RCU-unlock-in-acquire_resource.patch ++++++ # Commit 42317dede5be4fe5be27c873f7a3f94d3bba98e0 # Date 2020-09-22 15:46:49 +0200 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Jan Beulich <jbeulich@suse.com> xen/memory: Don't skip the RCU unlock path in acquire_resource() In the case that an HVM Stubdomain makes an XENMEM_acquire_resource hypercall, the FIXME path will bypass rcu_unlock_domain() on the way out of the function. Move the check to the start of the function. This does change the behaviour of the get-size path for HVM Stubdomains, but that functionality is currently broken and unused anyway, as well as being quite useless to entities which can't actually map the resource anyway. This is XSA-334. Fixes: 83fa6552ce ("common: add a new mappable resource type: XENMEM_resource_grant_table") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -1058,6 +1058,14 @@ static int acquire_resource( xen_pfn_t mfn_list[32]; int rc; + /* + * FIXME: Until foreign pages inserted into the P2M are properly + * reference counted, it is unsafe to allow mapping of + * resource pages unless the caller is the hardware domain. + */ + if ( paging_mode_translate(currd) && !is_hardware_domain(currd) ) + return -EACCES; + if ( copy_from_guest(&xmar, arg, 1) ) return -EFAULT; @@ -1114,14 +1122,6 @@ static int acquire_resource( xen_pfn_t gfn_list[ARRAY_SIZE(mfn_list)]; unsigned int i; - /* - * FIXME: Until foreign pages inserted into the P2M are properly - * reference counted, it is unsafe to allow mapping of - * resource pages unless the caller is the hardware domain. - */ - if ( !is_hardware_domain(currd) ) - return -EACCES; - if ( copy_from_guest(gfn_list, xmar.frame_list, xmar.nr_frames) ) rc = -EFAULT; ++++++ 5f6a0067-x86-vPT-fix-race-when-migrating-timers.patch ++++++ # Commit 8e76aef72820435e766c7f339ed36da33da90c40 # Date 2020-09-22 15:47:19 +0200 # Author Roger Pau Monné <roger.pau@citrix.com> # Committer Jan Beulich <jbeulich@suse.com> x86/vpt: fix race when migrating timers between vCPUs The current vPT code will migrate the emulated timers between vCPUs (change the pt->vcpu field) while just holding the destination lock, either from create_periodic_time or pt_adjust_global_vcpu_target if the global target is adjusted. Changing the periodic_timer vCPU field in this way creates a race where a third party could grab the lock in the unlocked region of pt_adjust_global_vcpu_target (or before create_periodic_time performs the vcpu change) and then release the lock from a different vCPU, creating a locking imbalance. Introduce a per-domain rwlock in order to protect periodic_time migration between vCPU lists. 
Taking the lock in read mode prevents any timer from being migrated to a different vCPU, while taking it in write mode allows performing migration of timers across vCPUs. The per-vcpu locks are still used to protect all the other fields from the periodic_timer struct. Note that such migration shouldn't happen frequently, and hence there's no performance drop as a result of such locking. This is XSA-336. Reported-by: Igor Druzhinin <igor.druzhinin@citrix.com> Tested-by: Igor Druzhinin <igor.druzhinin@citrix.com> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -658,6 +658,8 @@ int hvm_domain_initialise(struct domain /* need link to containing domain */ d->arch.hvm.pl_time->domain = d; + rwlock_init(&d->arch.hvm.pl_time->pt_migrate); + /* Set the default IO Bitmap. */ if ( is_hardware_domain(d) ) { --- a/xen/arch/x86/hvm/vpt.c +++ b/xen/arch/x86/hvm/vpt.c @@ -153,23 +153,32 @@ static int pt_irq_masked(struct periodic return 1; } -static void pt_lock(struct periodic_time *pt) +static void pt_vcpu_lock(struct vcpu *v) { - struct vcpu *v; + read_lock(&v->domain->arch.hvm.pl_time->pt_migrate); + spin_lock(&v->arch.hvm.tm_lock); +} - for ( ; ; ) - { - v = pt->vcpu; - spin_lock(&v->arch.hvm.tm_lock); - if ( likely(pt->vcpu == v) ) - break; - spin_unlock(&v->arch.hvm.tm_lock); - } +static void pt_vcpu_unlock(struct vcpu *v) +{ + spin_unlock(&v->arch.hvm.tm_lock); + read_unlock(&v->domain->arch.hvm.pl_time->pt_migrate); +} + +static void pt_lock(struct periodic_time *pt) +{ + /* + * We cannot use pt_vcpu_lock here, because we need to acquire the + * per-domain lock first and then (re-)fetch the value of pt->vcpu, or + * else we might be using a stale value of pt->vcpu. 
+ */ + read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); + spin_lock(&pt->vcpu->arch.hvm.tm_lock); } static void pt_unlock(struct periodic_time *pt) { - spin_unlock(&pt->vcpu->arch.hvm.tm_lock); + pt_vcpu_unlock(pt->vcpu); } static void pt_process_missed_ticks(struct periodic_time *pt) @@ -219,7 +228,7 @@ void pt_save_timer(struct vcpu *v) if ( v->pause_flags & VPF_blocked ) return; - spin_lock(&v->arch.hvm.tm_lock); + pt_vcpu_lock(v); list_for_each_entry ( pt, head, list ) if ( !pt->do_not_freeze ) @@ -227,7 +236,7 @@ void pt_save_timer(struct vcpu *v) pt_freeze_time(v); - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); } void pt_restore_timer(struct vcpu *v) @@ -235,7 +244,7 @@ void pt_restore_timer(struct vcpu *v) struct list_head *head = &v->arch.hvm.tm_list; struct periodic_time *pt; - spin_lock(&v->arch.hvm.tm_lock); + pt_vcpu_lock(v); list_for_each_entry ( pt, head, list ) { @@ -248,7 +257,7 @@ void pt_restore_timer(struct vcpu *v) pt_thaw_time(v); - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); } static void pt_timer_fn(void *data) @@ -309,7 +318,7 @@ int pt_update_irq(struct vcpu *v) int irq, pt_vector = -1; bool level; - spin_lock(&v->arch.hvm.tm_lock); + pt_vcpu_lock(v); earliest_pt = NULL; max_lag = -1ULL; @@ -339,7 +348,7 @@ int pt_update_irq(struct vcpu *v) if ( earliest_pt == NULL ) { - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); return -1; } @@ -347,7 +356,7 @@ int pt_update_irq(struct vcpu *v) irq = earliest_pt->irq; level = earliest_pt->level; - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); switch ( earliest_pt->source ) { @@ -394,7 +403,7 @@ int pt_update_irq(struct vcpu *v) time_cb *cb = NULL; void *cb_priv; - spin_lock(&v->arch.hvm.tm_lock); + pt_vcpu_lock(v); /* Make sure the timer is still on the list. 
*/ list_for_each_entry ( pt, &v->arch.hvm.tm_list, list ) if ( pt == earliest_pt ) @@ -404,7 +413,7 @@ int pt_update_irq(struct vcpu *v) cb_priv = pt->priv; break; } - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); if ( cb != NULL ) cb(v, cb_priv); @@ -441,12 +450,12 @@ void pt_intr_post(struct vcpu *v, struct if ( intack.source == hvm_intsrc_vector ) return; - spin_lock(&v->arch.hvm.tm_lock); + pt_vcpu_lock(v); pt = is_pt_irq(v, intack); if ( pt == NULL ) { - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); return; } @@ -455,7 +464,7 @@ void pt_intr_post(struct vcpu *v, struct cb = pt->cb; cb_priv = pt->priv; - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); if ( cb != NULL ) cb(v, cb_priv); @@ -466,12 +475,12 @@ void pt_migrate(struct vcpu *v) struct list_head *head = &v->arch.hvm.tm_list; struct periodic_time *pt; - spin_lock(&v->arch.hvm.tm_lock); + pt_vcpu_lock(v); list_for_each_entry ( pt, head, list ) migrate_timer(&pt->timer, v->processor); - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); } void create_periodic_time( @@ -490,7 +499,7 @@ void create_periodic_time( destroy_periodic_time(pt); - spin_lock(&v->arch.hvm.tm_lock); + write_lock(&v->domain->arch.hvm.pl_time->pt_migrate); pt->pending_intr_nr = 0; pt->do_not_freeze = 0; @@ -540,7 +549,7 @@ void create_periodic_time( init_timer(&pt->timer, pt_timer_fn, pt, v->processor); set_timer(&pt->timer, pt->scheduled); - spin_unlock(&v->arch.hvm.tm_lock); + write_unlock(&v->domain->arch.hvm.pl_time->pt_migrate); } void destroy_periodic_time(struct periodic_time *pt) @@ -565,30 +574,20 @@ void destroy_periodic_time(struct period static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v) { - int on_list; - ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic); if ( pt->vcpu == NULL ) return; - pt_lock(pt); - on_list = pt->on_list; - if ( pt->on_list ) - list_del(&pt->list); - pt->on_list = 0; - pt_unlock(pt); - - spin_lock(&v->arch.hvm.tm_lock); + write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); pt->vcpu = v; - if ( on_list ) + if ( pt->on_list ) { - pt->on_list = 1; + list_del(&pt->list); list_add(&pt->list, &v->arch.hvm.tm_list); - migrate_timer(&pt->timer, v->processor); } - spin_unlock(&v->arch.hvm.tm_lock); + write_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); } void pt_adjust_global_vcpu_target(struct vcpu *v) --- a/xen/include/asm-x86/hvm/vpt.h +++ b/xen/include/asm-x86/hvm/vpt.h @@ -128,6 +128,13 @@ struct pl_time { /* platform time */ struct RTCState vrtc; struct HPETState vhpet; struct PMTState vpmt; + /* + * rwlock to prevent periodic_time vCPU migration. Take the lock in read + * mode in order to prevent the vcpu field of periodic_time from changing. + * Lock must be taken in write mode when changes to the vcpu field are + * performed, as it allows exclusive access to all the timers of a domain. + */ + rwlock_t pt_migrate; /* guest_time = Xen sys time + stime_offset */ int64_t stime_offset; /* Ensures monotonicity in appropriate timer modes. */ ++++++ 5f6a008e-x86-MSI-drop-read_msi_msg.patch ++++++ # Commit cb5e9730862ba0c99e81d3fbd8f65010707245e4 # Date 2020-09-22 15:47:58 +0200 # Author Roger Pau Monné <roger.pau@citrix.com> # Committer Jan Beulich <jbeulich@suse.com> x86/msi: get rid of read_msi_msg It's safer and faster to just use the cached last written (untranslated) MSI message stored in msi_desc for the single user that calls read_msi_msg. 
This also prevents relying on the data read from the device MSI registers in order to figure out the index into the IOMMU interrupt remapping table, which is not safe. This is part of XSA-337. Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> Requested-by: Andrew Cooper <andrew.cooper3@citrix.com> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/msi.c +++ b/xen/arch/x86/msi.c @@ -185,54 +185,6 @@ void msi_compose_msg(unsigned vector, co MSI_DATA_VECTOR(vector); } -static bool read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) -{ - switch ( entry->msi_attrib.type ) - { - case PCI_CAP_ID_MSI: - { - struct pci_dev *dev = entry->dev; - int pos = entry->msi_attrib.pos; - uint16_t data; - - msg->address_lo = pci_conf_read32(dev->sbdf, - msi_lower_address_reg(pos)); - if ( entry->msi_attrib.is_64 ) - { - msg->address_hi = pci_conf_read32(dev->sbdf, - msi_upper_address_reg(pos)); - data = pci_conf_read16(dev->sbdf, msi_data_reg(pos, 1)); - } - else - { - msg->address_hi = 0; - data = pci_conf_read16(dev->sbdf, msi_data_reg(pos, 0)); - } - msg->data = data; - break; - } - case PCI_CAP_ID_MSIX: - { - void __iomem *base = entry->mask_base; - - if ( unlikely(!msix_memory_decoded(entry->dev, - entry->msi_attrib.pos)) ) - return false; - msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET); - break; - } - default: - BUG(); - } - - if ( iommu_intremap ) - iommu_read_msi_from_ire(entry, msg); - - return true; -} - static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { entry->msg = *msg; @@ -304,10 +256,7 @@ void set_msi_affinity(struct irq_desc *d ASSERT(spin_is_locked(&desc->lock)); - memset(&msg, 0, sizeof(msg)); - if ( !read_msi_msg(msi_desc, &msg) ) - return; - + msg = msi_desc->msg; msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(desc->arch.vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; ++++++ 5f6a00aa-x86-MSI-X-restrict-reading-of-PBA-bases.patch ++++++ # Commit beb54596cfdaf15f6a86d7b1bf84ca8a0b9c6b9b # Date 2020-09-22 15:48:26 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> x86/MSI-X: restrict reading of table/PBA bases from BARs When assigned to less trusted or un-trusted guests, devices may change state behind our backs (they may e.g. get reset by means we may not know about). Therefore we should avoid reading BARs from hardware once a device is no longer owned by Dom0. Furthermore when we can't read a BAR, or when we read zero, we shouldn't instead use the caller provided address unless that caller can be trusted. Re-arrange the logic in msix_capability_init() such that only Dom0 (and only if the device isn't DomU-owned yet) or calls through PHYSDEVOP_prepare_msix will actually result in the reading of the respective BAR register(s). Additionally do so only as long as in-use table entries are known (note that invocation of PHYSDEVOP_prepare_msix counts as a "pseudo" entry). In all other uses the value already recorded will get used instead. Clear the recorded values in _pci_cleanup_msix() as well as on the one affected error path. (Adjust this error path to also avoid blindly disabling MSI-X when it was enabled on entry to the function.) While moving around variable declarations (in many cases to reduce their scopes), also adjust some of their types. This is part of XSA-337. 
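The caching idea can be sketched in isolation as follows; struct msix_state, read_bar_from_hw() and get_table_base() are simplified illustrations under the assumption of a single recorded value, not the actual Xen data structures:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative state: a recorded table base; 0 means "nothing recorded". */
    struct msix_state {
        uint64_t table_base;
    };

    static uint64_t read_bar_from_hw(void)
    {
        return 0xfe000000;         /* mock BAR contents */
    }

    /*
     * Read the BAR only while the device is still owned by a trusted domain;
     * later callers get the recorded value or an error, never fresh (and
     * possibly attacker-influenced) hardware state.
     */
    static int get_table_base(struct msix_state *s, bool trusted, uint64_t *base)
    {
        if ( !s->table_base && trusted )
            s->table_base = read_bar_from_hw();
        if ( !s->table_base )
            return -1;
        *base = s->table_base;
        return 0;
    }

    int main(void)
    {
        struct msix_state s = { 0 };
        uint64_t base = 0;
        int rc;

        printf("untrusted, nothing recorded: %d\n", get_table_base(&s, false, &base));
        rc = get_table_base(&s, true, &base);
        printf("trusted: %d (base %#llx)\n", rc, (unsigned long long)base);
        printf("untrusted, recorded value reused: %d\n", get_table_base(&s, false, &base));
        return 0;
    }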
Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> --- a/xen/arch/x86/msi.c +++ b/xen/arch/x86/msi.c @@ -771,16 +771,14 @@ static int msix_capability_init(struct p { struct arch_msix *msix = dev->msix; struct msi_desc *entry = NULL; - int vf; u16 control; u64 table_paddr; u32 table_offset; - u8 bir, pbus, pslot, pfunc; u16 seg = dev->seg; u8 bus = dev->bus; u8 slot = PCI_SLOT(dev->devfn); u8 func = PCI_FUNC(dev->devfn); - bool maskall = msix->host_maskall; + bool maskall = msix->host_maskall, zap_on_error = false; unsigned int pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX); @@ -822,43 +820,45 @@ static int msix_capability_init(struct p /* Locate MSI-X table region */ table_offset = pci_conf_read32(dev->sbdf, msix_table_offset_reg(pos)); - bir = (u8)(table_offset & PCI_MSIX_BIRMASK); - table_offset &= ~PCI_MSIX_BIRMASK; + if ( !msix->used_entries && + (!msi || + (is_hardware_domain(current->domain) && + (dev->domain == current->domain || dev->domain == dom_io))) ) + { + unsigned int bir = table_offset & PCI_MSIX_BIRMASK, pbus, pslot, pfunc; + int vf; + paddr_t pba_paddr; + unsigned int pba_offset; - if ( !dev->info.is_virtfn ) - { - pbus = bus; - pslot = slot; - pfunc = func; - vf = -1; - } - else - { - pbus = dev->info.physfn.bus; - pslot = PCI_SLOT(dev->info.physfn.devfn); - pfunc = PCI_FUNC(dev->info.physfn.devfn); - vf = PCI_BDF2(dev->bus, dev->devfn); - } - - table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf); - WARN_ON(msi && msi->table_base != table_paddr); - if ( !table_paddr ) - { - if ( !msi || !msi->table_base ) + if ( !dev->info.is_virtfn ) { - pci_conf_write16(dev->sbdf, msix_control_reg(pos), - control & ~PCI_MSIX_FLAGS_ENABLE); - xfree(entry); - return -ENXIO; + pbus = bus; + pslot = slot; + pfunc = func; + vf = -1; + } + else + { + pbus = dev->info.physfn.bus; + pslot = PCI_SLOT(dev->info.physfn.devfn); + pfunc = PCI_FUNC(dev->info.physfn.devfn); + vf = PCI_BDF2(dev->bus, dev->devfn); } - table_paddr = msi->table_base; - } - table_paddr += table_offset; - if ( !msix->used_entries ) - { - u64 pba_paddr; - u32 pba_offset; + table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf); + WARN_ON(msi && msi->table_base != table_paddr); + if ( !table_paddr ) + { + if ( !msi || !msi->table_base ) + { + pci_conf_write16(dev->sbdf, msix_control_reg(pos), + control & ~PCI_MSIX_FLAGS_ENABLE); + xfree(entry); + return -ENXIO; + } + table_paddr = msi->table_base; + } + table_paddr += table_offset & ~PCI_MSIX_BIRMASK; msix->table.first = PFN_DOWN(table_paddr); msix->table.last = PFN_DOWN(table_paddr + @@ -877,7 +877,18 @@ static int msix_capability_init(struct p BITS_TO_LONGS(msix->nr_entries) - 1); WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->pba.first, msix->pba.last)); + + zap_on_error = true; + } + else if ( !msix->table.first ) + { + pci_conf_write16(dev->sbdf, msix_control_reg(pos), control); + xfree(entry); + return -ENODATA; } + else + table_paddr = (msix->table.first << PAGE_SHIFT) + + PAGE_OFFSET(table_offset & ~PCI_MSIX_BIRMASK); if ( entry ) { @@ -888,8 +899,15 @@ static int msix_capability_init(struct p if ( idx < 0 ) { - pci_conf_write16(dev->sbdf, msix_control_reg(pos), - control & ~PCI_MSIX_FLAGS_ENABLE); + if ( zap_on_error ) + { + msix->table.first = 0; + msix->pba.first = 0; + + control &= ~PCI_MSIX_FLAGS_ENABLE; + } + + pci_conf_write16(dev->sbdf, msix_control_reg(pos), control); xfree(entry); return idx; } @@ -1078,9 +1096,14 @@ static void _pci_cleanup_msix(struct arc if ( 
rangeset_remove_range(mmio_ro_ranges, msix->table.first, msix->table.last) ) WARN(); + msix->table.first = 0; + msix->table.last = 0; + if ( rangeset_remove_range(mmio_ro_ranges, msix->pba.first, msix->pba.last) ) WARN(); + msix->pba.first = 0; + msix->pba.last = 0; } } ++++++ 5f6a062c-evtchn-relax-port_is_valid.patch -> 5f6a00c4-evtchn-relax-port_is_valid.patch ++++++ --- /work/SRC/openSUSE:Factory/xen/5f6a062c-evtchn-relax-port_is_valid.patch 2020-09-25 16:21:23.363351410 +0200 +++ /work/SRC/openSUSE:Factory/.xen.new.3463/5f6a00c4-evtchn-relax-port_is_valid.patch 2020-11-02 09:38:38.753509315 +0100 @@ -1,7 +1,8 @@ -Subject: evtchn: relax port_is_valid() -From: Jan Beulich jbeulich@suse.com Tue Sep 22 16:11:56 2020 +0200 -Date: Tue Sep 22 16:11:56 2020 +0200: -Git: e417504febf4ef5cc3442ebaa00da1a5c7939b22 +# Commit e59ce972d1280c6c55065da822e0860845582053 +# Date 2020-09-22 15:48:52 +0200 +# Author Jan Beulich <jbeulich@suse.com> +# Committer Jan Beulich <jbeulich@suse.com> +evtchn: relax port_is_valid() To avoid ports potentially becoming invalid behind the back of certain other functions (due to ->max_evtchn shrinking) because of @@ -29,11 +30,9 @@ Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> Reviewed-by: Julien Grall <jgrall@amazon.com> -diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h -index a7798f6765..ce45298377 100644 --- a/xen/include/xen/event.h +++ b/xen/include/xen/event.h -@@ -107,8 +107,6 @@ void notify_via_xen_event_channel(struct domain *ld, int lport); +@@ -107,8 +107,6 @@ void notify_via_xen_event_channel(struct static inline bool_t port_is_valid(struct domain *d, unsigned int p) { ++++++ 5f6a00df-x86-PV-avoid-double-exception-injection.patch ++++++ # Commit 910093d54fc758e7d69261b344fdc8da3a7bd81e # Date 2020-09-22 15:49:19 +0200 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Jan Beulich <jbeulich@suse.com> x86/pv: Avoid double exception injection There is at least one path (SYSENTER with NT set, Xen converts to #GP) which ends up injecting the #GP fault twice, first in compat_sysenter(), and then a second time in compat_test_all_events(), due to the stale TBF_EXCEPTION left in TRAPBOUNCE_flags. The guest kernel sees the second fault first, which is a kernel level #GP pointing at the head of the #GP handler, and is therefore a userspace trigger-able DoS. This particular bug has bitten us several times before, so rearrange {compat_,}create_bounce_frame() to clobber TRAPBOUNCE on success, rather than leaving this task to one area of code which isn't used uniformly. Other scenarios which might result in a double injection (e.g. two calls directly to compat_create_bounce_frame) will now crash the guest, which is far more obvious than letting the kernel run with corrupt state. This is XSA-339 Fixes: fdac9515607b ("x86: clear EFLAGS.NT in SYSENTER entry path") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/x86_64/compat/entry.S +++ b/xen/arch/x86/x86_64/compat/entry.S @@ -78,7 +78,6 @@ compat_process_softirqs: sti .Lcompat_bounce_exception: call compat_create_bounce_frame - movb $0, TRAPBOUNCE_flags(%rdx) jmp compat_test_all_events ALIGN @@ -352,7 +351,13 @@ __UNLIKELY_END(compat_bounce_null_select movl %eax,UREGS_cs+8(%rsp) movl TRAPBOUNCE_eip(%rdx),%eax movl %eax,UREGS_rip+8(%rsp) + + /* Trapbounce complete. Clobber state to avoid an erroneous second injection. 
*/ + xor %eax, %eax + mov %ax, TRAPBOUNCE_cs(%rdx) + mov %al, TRAPBOUNCE_flags(%rdx) ret + .section .fixup,"ax" .Lfx13: xorl %edi,%edi --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -90,7 +90,6 @@ process_softirqs: sti .Lbounce_exception: call create_bounce_frame - movb $0, TRAPBOUNCE_flags(%rdx) jmp test_all_events ALIGN @@ -512,6 +511,11 @@ UNLIKELY_START(z, create_bounce_frame_ba jmp asm_domain_crash_synchronous /* Does not return */ __UNLIKELY_END(create_bounce_frame_bad_bounce_ip) movq %rax,UREGS_rip+8(%rsp) + + /* Trapbounce complete. Clobber state to avoid an erroneous second injection. */ + xor %eax, %eax + mov %rax, TRAPBOUNCE_eip(%rdx) + mov %al, TRAPBOUNCE_flags(%rdx) ret .pushsection .fixup, "ax", @progbits ++++++ 5f6a00f4-evtchn-add-missing-barriers.patch ++++++ # Commit 112992b05b2d2ca63f3c78eefe1cf8d192d7303a # Date 2020-09-22 15:49:40 +0200 # Author Julien Grall <jgrall@amazon.com> # Committer Jan Beulich <jbeulich@suse.com> xen/evtchn: Add missing barriers when accessing/allocating an event channel While the allocation of a bucket is always performed with the per-domain lock, the bucket may be accessed without the lock taken (for instance, see evtchn_send()). Instead such sites relies on port_is_valid() to return a non-zero value when the port has a struct evtchn associated to it. The function will mostly check whether the port is less than d->valid_evtchns as all the buckets/event channels should be allocated up to that point. Unfortunately a compiler is free to re-order the assignment in evtchn_allocate_port() so it would be possible to have d->valid_evtchns updated before the new bucket has finish to allocate. Additionally on Arm, even if this was compiled "correctly", the processor can still re-order the memory access. Add a write memory barrier in the allocation side and a read memory barrier when the port is valid to prevent any re-ordering issue. This is XSA-340. Reported-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Julien Grall <jgrall@amazon.com> Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c @@ -178,6 +178,13 @@ int evtchn_allocate_port(struct domain * return -ENOMEM; bucket_from_port(d, port) = chn; + /* + * d->valid_evtchns is used to check whether the bucket can be + * accessed without the per-domain lock. Therefore, + * d->valid_evtchns should be seen *after* the new bucket has + * been setup. + */ + smp_wmb(); write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET); } --- a/xen/include/xen/event.h +++ b/xen/include/xen/event.h @@ -107,7 +107,17 @@ void notify_via_xen_event_channel(struct static inline bool_t port_is_valid(struct domain *d, unsigned int p) { - return p < read_atomic(&d->valid_evtchns); + if ( p >= read_atomic(&d->valid_evtchns) ) + return false; + + /* + * The caller will usually access the event channel afterwards and + * may be done without taking the per-domain lock. The barrier is + * going in pair the smp_wmb() barrier in evtchn_allocate_port(). 
+ */ + smp_rmb(); + + return true; } static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p) ++++++ 5f6a068e-evtchn-x86-enforce-correct-upper-limit-for-32-bit-guests.patch -> 5f6a0111-evtchn-x86-enforce-correct-upper-limit.patch ++++++ --- /work/SRC/openSUSE:Factory/xen/5f6a068e-evtchn-x86-enforce-correct-upper-limit-for-32-bit-guests.patch 2020-09-25 16:21:28.159355656 +0200 +++ /work/SRC/openSUSE:Factory/.xen.new.3463/5f6a0111-evtchn-x86-enforce-correct-upper-limit.patch 2020-11-02 09:38:38.825509384 +0100 @@ -1,7 +1,8 @@ -Subject: evtchn/x86: enforce correct upper limit for 32-bit guests -From: Jan Beulich jbeulich@suse.com Tue Sep 22 16:13:34 2020 +0200 -Date: Tue Sep 22 16:13:34 2020 +0200: -Git: b8c2efbe7b3e8fa5f0b0a3679afccd1204949070 +# Commit 62bcdc4edbf6d8c6e8a25544d48de22ccf75310d +# Date 2020-09-22 15:50:09 +0200 +# Author Jan Beulich <jbeulich@suse.com> +# Committer Jan Beulich <jbeulich@suse.com> +evtchn/x86: enforce correct upper limit for 32-bit guests The recording of d->max_evtchns in evtchn_2l_init(), in particular with the limited set of callers of the function, is insufficient. Neither for @@ -34,11 +35,9 @@ Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> Reviewed-by: Julien Grall <jgrall@amazon.com> -diff --git a/xen/common/event_2l.c b/xen/common/event_2l.c -index e1dbb860f4..a229d35271 100644 --- a/xen/common/event_2l.c +++ b/xen/common/event_2l.c -@@ -103,7 +103,6 @@ static const struct evtchn_port_ops evtchn_port_ops_2l = +@@ -103,7 +103,6 @@ static const struct evtchn_port_ops evtc void evtchn_2l_init(struct domain *d) { d->evtchn_port_ops = &evtchn_port_ops_2l; @@ -46,11 +45,9 @@ } /* -diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c -index 53c17bd354..08ffe0f063 100644 --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c -@@ -151,7 +151,7 @@ static void free_evtchn_bucket(struct domain *d, struct evtchn *bucket) +@@ -151,7 +151,7 @@ static void free_evtchn_bucket(struct do int evtchn_allocate_port(struct domain *d, evtchn_port_t port) { @@ -59,7 +56,7 @@ return -ENOSPC; if ( port_is_valid(d, port) ) -@@ -1396,13 +1396,11 @@ static void domain_dump_evtchn_info(struct domain *d) +@@ -1396,13 +1396,11 @@ static void domain_dump_evtchn_info(stru spin_lock(&d->event_lock); @@ -74,11 +71,9 @@ chn = evtchn_from_port(d, port); if ( chn->state == ECS_FREE ) continue; -diff --git a/xen/common/event_fifo.c b/xen/common/event_fifo.c -index 230f440f14..2f13d92128 100644 --- a/xen/common/event_fifo.c +++ b/xen/common/event_fifo.c -@@ -478,7 +478,7 @@ static void cleanup_event_array(struct domain *d) +@@ -478,7 +478,7 @@ static void cleanup_event_array(struct d d->evtchn_fifo = NULL; } @@ -87,7 +82,7 @@ { unsigned int port; -@@ -488,7 +488,7 @@ static void setup_ports(struct domain *d) +@@ -488,7 +488,7 @@ static void setup_ports(struct domain *d * - save its pending state. * - set default priority. 
*/ @@ -96,7 +91,7 @@ { struct evtchn *evtchn; -@@ -546,6 +546,8 @@ int evtchn_fifo_init_control(struct evtchn_init_control *init_control) +@@ -546,6 +546,8 @@ int evtchn_fifo_init_control(struct evtc if ( !d->evtchn_fifo ) { struct vcpu *vcb; @@ -105,7 +100,7 @@ for_each_vcpu ( d, vcb ) { rc = setup_control_block(vcb); -@@ -562,8 +564,7 @@ int evtchn_fifo_init_control(struct evtchn_init_control *init_control) +@@ -562,8 +564,7 @@ int evtchn_fifo_init_control(struct evtc goto error; d->evtchn_port_ops = &evtchn_port_ops_fifo; @@ -115,11 +110,9 @@ } else rc = map_control_block(v, gfn, offset); -diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c -index cb49a8bc02..ab94d2ec3a 100644 --- a/xen/common/sched/core.c +++ b/xen/common/sched/core.c -@@ -1428,7 +1428,7 @@ static long do_poll(struct sched_poll *sched_poll) +@@ -1428,7 +1428,7 @@ static long do_poll(struct sched_poll *s goto out; rc = -EINVAL; @@ -128,11 +121,9 @@ goto out; rc = 0; -diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h -index c35f4b23b6..e1b299e8df 100644 --- a/xen/include/xen/event.h +++ b/xen/include/xen/event.h -@@ -105,6 +105,12 @@ void notify_via_xen_event_channel(struct domain *ld, int lport); +@@ -105,6 +105,12 @@ void notify_via_xen_event_channel(struct #define bucket_from_port(d, p) \ ((group_from_port(d, p))[((p) % EVTCHNS_PER_GROUP) / EVTCHNS_PER_BUCKET]) @@ -145,8 +136,6 @@ static inline bool_t port_is_valid(struct domain *d, unsigned int p) { if ( p >= read_atomic(&d->valid_evtchns) ) -diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h -index ac53519d7f..545f2bdcd0 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -359,7 +359,6 @@ struct domain ++++++ 5f6a013f-evtchn_reset-shouldnt-succeed-with.patch ++++++ # Commit 8d385b247bca40ece40c9279391054bc98934325 # Date 2020-09-22 15:50:55 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> evtchn: evtchn_reset() shouldn't succeed with still-open ports While the function closes all ports, it does so without holding any lock, and hence racing requests may be issued causing new ports to get opened. This would have been problematic in particular if such a newly opened port had a port number above the new implementation limit (i.e. when switching from FIFO to 2-level) after the reset, as prior to "evtchn: relax port_is_valid()" this could have led to e.g. evtchn_close()'s "BUG_ON(!port_is_valid(d2, port2))" to trigger. Introduce a counter of active ports and check that it's (still) no larger then the number of Xen internally used ones after obtaining the necessary lock in evtchn_reset(). As to the access model of the new {active,xen}_evtchns fields - while all writes get done using write_atomic(), reads ought to use read_atomic() only when outside of a suitably locked region. Note that as of now evtchn_bind_virq() and evtchn_bind_ipi() don't have a need to call check_free_port(). This is part of XSA-343. 
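A standalone C model of the two counters and of the check performed by evtchn_reset() might look as follows (struct dom and try_reset() are illustrative only; the real code additionally uses read_atomic()/write_atomic() and holds the event lock):

    #include <errno.h>
    #include <stdio.h>

    /* Illustrative counters mirroring the new domain fields. */
    struct dom {
        unsigned int active_evtchns;  /* all in-use ports */
        unsigned int xen_evtchns;     /* ports consumed internally by Xen */
    };

    /* Refuse the ABI switch while guest-created ports remain open. */
    static int try_reset(const struct dom *d)
    {
        return d->active_evtchns > d->xen_evtchns ? -EAGAIN : 0;
    }

    int main(void)
    {
        struct dom d = { .active_evtchns = 3, .xen_evtchns = 1 };

        printf("with open guest ports: %d\n", try_reset(&d));   /* -EAGAIN */
        d.active_evtchns = 1;
        printf("only Xen-internal left: %d\n", try_reset(&d));  /* 0 */
        return 0;
    }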
Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> Reviewed-by: Julien Grall <jgrall@amazon.com> --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c @@ -188,6 +188,8 @@ int evtchn_allocate_port(struct domain * write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET); } + write_atomic(&d->active_evtchns, d->active_evtchns + 1); + return 0; } @@ -211,11 +213,26 @@ static int get_free_port(struct domain * return -ENOSPC; } +/* + * Check whether a port is still marked free, and if so update the domain + * counter accordingly. To be used on function exit paths. + */ +static void check_free_port(struct domain *d, evtchn_port_t port) +{ + if ( port_is_valid(d, port) && + evtchn_from_port(d, port)->state == ECS_FREE ) + write_atomic(&d->active_evtchns, d->active_evtchns - 1); +} + void evtchn_free(struct domain *d, struct evtchn *chn) { /* Clear pending event to avoid unexpected behavior on re-bind. */ evtchn_port_clear_pending(d, chn); + if ( consumer_is_xen(chn) ) + write_atomic(&d->xen_evtchns, d->xen_evtchns - 1); + write_atomic(&d->active_evtchns, d->active_evtchns - 1); + /* Reset binding to vcpu0 when the channel is freed. */ chn->state = ECS_FREE; chn->notify_vcpu_id = 0; @@ -258,6 +275,7 @@ static long evtchn_alloc_unbound(evtchn_ alloc->port = port; out: + check_free_port(d, port); spin_unlock(&d->event_lock); rcu_unlock_domain(d); @@ -351,6 +369,7 @@ static long evtchn_bind_interdomain(evtc bind->local_port = lport; out: + check_free_port(ld, lport); spin_unlock(&ld->event_lock); if ( ld != rd ) spin_unlock(&rd->event_lock); @@ -488,7 +507,7 @@ static long evtchn_bind_pirq(evtchn_bind struct domain *d = current->domain; struct vcpu *v = d->vcpu[0]; struct pirq *info; - int port, pirq = bind->pirq; + int port = 0, pirq = bind->pirq; long rc; if ( (pirq < 0) || (pirq >= d->nr_pirqs) ) @@ -536,6 +555,7 @@ static long evtchn_bind_pirq(evtchn_bind arch_evtchn_bind_pirq(d, pirq); out: + check_free_port(d, port); spin_unlock(&d->event_lock); return rc; @@ -1011,10 +1031,10 @@ int evtchn_unmask(unsigned int port) return 0; } - int evtchn_reset(struct domain *d) { unsigned int i; + int rc = 0; if ( d != current->domain && !d->controller_pause_count ) return -EINVAL; @@ -1024,7 +1044,9 @@ int evtchn_reset(struct domain *d) spin_lock(&d->event_lock); - if ( d->evtchn_fifo ) + if ( d->active_evtchns > d->xen_evtchns ) + rc = -EAGAIN; + else if ( d->evtchn_fifo ) { /* Switching back to 2-level ABI. */ evtchn_fifo_destroy(d); @@ -1033,7 +1055,7 @@ int evtchn_reset(struct domain *d) spin_unlock(&d->event_lock); - return 0; + return rc; } static long evtchn_set_priority(const struct evtchn_set_priority *set_priority) @@ -1219,10 +1241,9 @@ int alloc_unbound_xen_event_channel( spin_lock(&ld->event_lock); - rc = get_free_port(ld); + port = rc = get_free_port(ld); if ( rc < 0 ) goto out; - port = rc; chn = evtchn_from_port(ld, port); rc = xsm_evtchn_unbound(XSM_TARGET, ld, chn, remote_domid); @@ -1238,7 +1259,10 @@ int alloc_unbound_xen_event_channel( spin_unlock(&chn->lock); + write_atomic(&ld->xen_evtchns, ld->xen_evtchns + 1); + out: + check_free_port(ld, port); spin_unlock(&ld->event_lock); return rc < 0 ? 
rc : port; @@ -1314,6 +1338,7 @@ int evtchn_init(struct domain *d, unsign return -EINVAL; } evtchn_from_port(d, 0)->state = ECS_RESERVED; + write_atomic(&d->active_evtchns, 0); #if MAX_VIRT_CPUS > BITS_PER_LONG d->poll_mask = xzalloc_array(unsigned long, BITS_TO_LONGS(d->max_vcpus)); @@ -1340,6 +1365,8 @@ void evtchn_destroy(struct domain *d) for ( i = 0; port_is_valid(d, i); i++ ) evtchn_close(d, i, 0); + ASSERT(!d->active_evtchns); + clear_global_virq_handlers(d); evtchn_fifo_destroy(d); --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -361,6 +361,16 @@ struct domain struct evtchn **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */ unsigned int max_evtchn_port; /* max permitted port number */ unsigned int valid_evtchns; /* number of allocated event channels */ + /* + * Number of in-use event channels. Writers should use write_atomic(). + * Readers need to use read_atomic() only when not holding event_lock. + */ + unsigned int active_evtchns; + /* + * Number of event channels used internally by Xen (not subject to + * EVTCHNOP_reset). Read/write access like for active_evtchns. + */ + unsigned int xen_evtchns; spinlock_t event_lock; const struct evtchn_port_ops *evtchn_port_ops; struct evtchn_fifo_domain *evtchn_fifo; ++++++ 5f6a0160-evtchn-IRQ-safe-per-channel-lock.patch ++++++ # Commit c0ddc8634845aba50774add6e4b73fdaffc82656 # Date 2020-09-22 15:51:28 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> evtchn: convert per-channel lock to be IRQ-safe ... in order for send_guest_{global,vcpu}_virq() to be able to make use of it. This is part of XSA-343. Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Julien Grall <jgrall@amazon.com> --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c @@ -248,6 +248,7 @@ static long evtchn_alloc_unbound(evtchn_ int port; domid_t dom = alloc->dom; long rc; + unsigned long flags; d = rcu_lock_domain_by_any_id(dom); if ( d == NULL ) @@ -263,14 +264,14 @@ static long evtchn_alloc_unbound(evtchn_ if ( rc ) goto out; - spin_lock(&chn->lock); + spin_lock_irqsave(&chn->lock, flags); chn->state = ECS_UNBOUND; if ( (chn->u.unbound.remote_domid = alloc->remote_dom) == DOMID_SELF ) chn->u.unbound.remote_domid = current->domain->domain_id; evtchn_port_init(d, chn); - spin_unlock(&chn->lock); + spin_unlock_irqrestore(&chn->lock, flags); alloc->port = port; @@ -283,26 +284,32 @@ static long evtchn_alloc_unbound(evtchn_ } -static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn) +static unsigned long double_evtchn_lock(struct evtchn *lchn, + struct evtchn *rchn) { - if ( lchn < rchn ) + unsigned long flags; + + if ( lchn <= rchn ) { - spin_lock(&lchn->lock); - spin_lock(&rchn->lock); + spin_lock_irqsave(&lchn->lock, flags); + if ( lchn != rchn ) + spin_lock(&rchn->lock); } else { - if ( lchn != rchn ) - spin_lock(&rchn->lock); + spin_lock_irqsave(&rchn->lock, flags); spin_lock(&lchn->lock); } + + return flags; } -static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn) +static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn, + unsigned long flags) { - spin_unlock(&lchn->lock); if ( lchn != rchn ) - spin_unlock(&rchn->lock); + spin_unlock(&lchn->lock); + spin_unlock_irqrestore(&rchn->lock, flags); } static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind) @@ -312,6 +319,7 @@ static long evtchn_bind_interdomain(evtc int lport, rport = bind->remote_port; domid_t rdom = bind->remote_dom; long rc; + unsigned long flags; if ( rdom == 
DOMID_SELF ) rdom = current->domain->domain_id; @@ -347,7 +355,7 @@ static long evtchn_bind_interdomain(evtc if ( rc ) goto out; - double_evtchn_lock(lchn, rchn); + flags = double_evtchn_lock(lchn, rchn); lchn->u.interdomain.remote_dom = rd; lchn->u.interdomain.remote_port = rport; @@ -364,7 +372,7 @@ static long evtchn_bind_interdomain(evtc */ evtchn_port_set_pending(ld, lchn->notify_vcpu_id, lchn); - double_evtchn_unlock(lchn, rchn); + double_evtchn_unlock(lchn, rchn, flags); bind->local_port = lport; @@ -387,6 +395,7 @@ int evtchn_bind_virq(evtchn_bind_virq_t struct domain *d = current->domain; int virq = bind->virq, vcpu = bind->vcpu; int rc = 0; + unsigned long flags; if ( (virq < 0) || (virq >= ARRAY_SIZE(v->virq_to_evtchn)) ) return -EINVAL; @@ -424,14 +433,14 @@ int evtchn_bind_virq(evtchn_bind_virq_t chn = evtchn_from_port(d, port); - spin_lock(&chn->lock); + spin_lock_irqsave(&chn->lock, flags); chn->state = ECS_VIRQ; chn->notify_vcpu_id = vcpu; chn->u.virq = virq; evtchn_port_init(d, chn); - spin_unlock(&chn->lock); + spin_unlock_irqrestore(&chn->lock, flags); v->virq_to_evtchn[virq] = bind->port = port; @@ -448,6 +457,7 @@ static long evtchn_bind_ipi(evtchn_bind_ struct domain *d = current->domain; int port, vcpu = bind->vcpu; long rc = 0; + unsigned long flags; if ( domain_vcpu(d, vcpu) == NULL ) return -ENOENT; @@ -459,13 +469,13 @@ static long evtchn_bind_ipi(evtchn_bind_ chn = evtchn_from_port(d, port); - spin_lock(&chn->lock); + spin_lock_irqsave(&chn->lock, flags); chn->state = ECS_IPI; chn->notify_vcpu_id = vcpu; evtchn_port_init(d, chn); - spin_unlock(&chn->lock); + spin_unlock_irqrestore(&chn->lock, flags); bind->port = port; @@ -509,6 +519,7 @@ static long evtchn_bind_pirq(evtchn_bind struct pirq *info; int port = 0, pirq = bind->pirq; long rc; + unsigned long flags; if ( (pirq < 0) || (pirq >= d->nr_pirqs) ) return -EINVAL; @@ -541,14 +552,14 @@ static long evtchn_bind_pirq(evtchn_bind goto out; } - spin_lock(&chn->lock); + spin_lock_irqsave(&chn->lock, flags); chn->state = ECS_PIRQ; chn->u.pirq.irq = pirq; link_pirq_port(port, chn, v); evtchn_port_init(d, chn); - spin_unlock(&chn->lock); + spin_unlock_irqrestore(&chn->lock, flags); bind->port = port; @@ -569,6 +580,7 @@ int evtchn_close(struct domain *d1, int struct evtchn *chn1, *chn2; int port2; long rc = 0; + unsigned long flags; again: spin_lock(&d1->event_lock); @@ -668,14 +680,14 @@ int evtchn_close(struct domain *d1, int BUG_ON(chn2->state != ECS_INTERDOMAIN); BUG_ON(chn2->u.interdomain.remote_dom != d1); - double_evtchn_lock(chn1, chn2); + flags = double_evtchn_lock(chn1, chn2); evtchn_free(d1, chn1); chn2->state = ECS_UNBOUND; chn2->u.unbound.remote_domid = d1->domain_id; - double_evtchn_unlock(chn1, chn2); + double_evtchn_unlock(chn1, chn2, flags); goto out; @@ -683,9 +695,9 @@ int evtchn_close(struct domain *d1, int BUG(); } - spin_lock(&chn1->lock); + spin_lock_irqsave(&chn1->lock, flags); evtchn_free(d1, chn1); - spin_unlock(&chn1->lock); + spin_unlock_irqrestore(&chn1->lock, flags); out: if ( d2 != NULL ) @@ -705,13 +717,14 @@ int evtchn_send(struct domain *ld, unsig struct evtchn *lchn, *rchn; struct domain *rd; int rport, ret = 0; + unsigned long flags; if ( !port_is_valid(ld, lport) ) return -EINVAL; lchn = evtchn_from_port(ld, lport); - spin_lock(&lchn->lock); + spin_lock_irqsave(&lchn->lock, flags); /* Guest cannot send via a Xen-attached event channel. 
*/ if ( unlikely(consumer_is_xen(lchn)) ) @@ -746,7 +759,7 @@ int evtchn_send(struct domain *ld, unsig } out: - spin_unlock(&lchn->lock); + spin_unlock_irqrestore(&lchn->lock, flags); return ret; } @@ -1238,6 +1251,7 @@ int alloc_unbound_xen_event_channel( { struct evtchn *chn; int port, rc; + unsigned long flags; spin_lock(&ld->event_lock); @@ -1250,14 +1264,14 @@ int alloc_unbound_xen_event_channel( if ( rc ) goto out; - spin_lock(&chn->lock); + spin_lock_irqsave(&chn->lock, flags); chn->state = ECS_UNBOUND; chn->xen_consumer = get_xen_consumer(notification_fn); chn->notify_vcpu_id = lvcpu; chn->u.unbound.remote_domid = remote_domid; - spin_unlock(&chn->lock); + spin_unlock_irqrestore(&chn->lock, flags); write_atomic(&ld->xen_evtchns, ld->xen_evtchns + 1); @@ -1280,11 +1294,12 @@ void notify_via_xen_event_channel(struct { struct evtchn *lchn, *rchn; struct domain *rd; + unsigned long flags; ASSERT(port_is_valid(ld, lport)); lchn = evtchn_from_port(ld, lport); - spin_lock(&lchn->lock); + spin_lock_irqsave(&lchn->lock, flags); if ( likely(lchn->state == ECS_INTERDOMAIN) ) { @@ -1294,7 +1309,7 @@ void notify_via_xen_event_channel(struct evtchn_port_set_pending(rd, rchn->notify_vcpu_id, rchn); } - spin_unlock(&lchn->lock); + spin_unlock_irqrestore(&lchn->lock, flags); } void evtchn_check_pollers(struct domain *d, unsigned int port) ++++++ 5f6a06f2-evtchn-address-races-with-evtchn_reset.patch -> 5f6a0178-evtchn-address-races-with-evtchn_reset.patch ++++++ --- /work/SRC/openSUSE:Factory/xen/5f6a06f2-evtchn-address-races-with-evtchn_reset.patch 2020-09-25 16:21:30.175357441 +0200 +++ /work/SRC/openSUSE:Factory/.xen.new.3463/5f6a0178-evtchn-address-races-with-evtchn_reset.patch 2020-11-02 09:38:38.893509450 +0100 @@ -1,7 +1,8 @@ -Subject: evtchn: address races with evtchn_reset() -From: Jan Beulich jbeulich@suse.com Tue Sep 22 16:15:14 2020 +0200 -Date: Tue Sep 22 16:15:14 2020 +0200: -Git: ecc6428b7ea63a24e244f747e8568c0ccc03a6f8 +# Commit e045199c7c9c5433d7f1461a741ed539a75cbfad +# Date 2020-09-22 15:51:52 +0200 +# Author Jan Beulich <jbeulich@suse.com> +# Committer Jan Beulich <jbeulich@suse.com> +evtchn: address races with evtchn_reset() Neither d->evtchn_port_ops nor max_evtchns(d) may be used in an entirely lock-less manner, as both may change by a racing evtchn_reset(). In the @@ -29,8 +30,6 @@ Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Julien Grall <jgrall@amazon.com> -diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c -index a69937c840..93c4fb9a79 100644 --- a/xen/arch/x86/irq.c +++ b/xen/arch/x86/irq.c @@ -2488,14 +2488,24 @@ static void dump_irqs(unsigned char key) @@ -62,11 +61,9 @@ i < action->nr_guests ? 
',' : '\n'); } } -diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c -index 3a0525c209..9aef7a860a 100644 --- a/xen/arch/x86/pv/shim.c +++ b/xen/arch/x86/pv/shim.c -@@ -660,8 +660,11 @@ void pv_shim_inject_evtchn(unsigned int port) +@@ -660,8 +660,11 @@ void pv_shim_inject_evtchn(unsigned int if ( port_is_valid(guest, port) ) { struct evtchn *chn = evtchn_from_port(guest, port); @@ -78,11 +75,9 @@ } } -diff --git a/xen/common/event_2l.c b/xen/common/event_2l.c -index a229d35271..083d04be3c 100644 --- a/xen/common/event_2l.c +++ b/xen/common/event_2l.c -@@ -63,8 +63,10 @@ static void evtchn_2l_unmask(struct domain *d, struct evtchn *evtchn) +@@ -63,8 +63,10 @@ static void evtchn_2l_unmask(struct doma } } @@ -94,7 +89,7 @@ unsigned int max_ports = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); ASSERT(port < max_ports); -@@ -72,8 +74,10 @@ static bool evtchn_2l_is_pending(const struct domain *d, evtchn_port_t port) +@@ -72,8 +74,10 @@ static bool evtchn_2l_is_pending(const s guest_test_bit(d, port, &shared_info(d, evtchn_pending))); } @@ -106,11 +101,9 @@ unsigned int max_ports = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); ASSERT(port < max_ports); -diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c -index 0e550e9c7a..878e4250ed 100644 --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c -@@ -156,8 +156,9 @@ int evtchn_allocate_port(struct domain *d, evtchn_port_t port) +@@ -156,8 +156,9 @@ int evtchn_allocate_port(struct domain * if ( port_is_valid(d, port) ) { @@ -122,7 +115,7 @@ return -EBUSY; } else -@@ -774,6 +775,7 @@ void send_guest_vcpu_virq(struct vcpu *v, uint32_t virq) +@@ -774,6 +775,7 @@ void send_guest_vcpu_virq(struct vcpu *v unsigned long flags; int port; struct domain *d; @@ -130,7 +123,7 @@ ASSERT(!virq_is_global(virq)); -@@ -784,7 +786,10 @@ void send_guest_vcpu_virq(struct vcpu *v, uint32_t virq) +@@ -784,7 +786,10 @@ void send_guest_vcpu_virq(struct vcpu *v goto out; d = v->domain; @@ -142,7 +135,7 @@ out: spin_unlock_irqrestore(&v->virq_lock, flags); -@@ -813,7 +818,9 @@ void send_guest_global_virq(struct domain *d, uint32_t virq) +@@ -813,7 +818,9 @@ void send_guest_global_virq(struct domai goto out; chn = evtchn_from_port(d, port); @@ -152,7 +145,7 @@ out: spin_unlock_irqrestore(&v->virq_lock, flags); -@@ -823,6 +830,7 @@ void send_guest_pirq(struct domain *d, const struct pirq *pirq) +@@ -823,6 +830,7 @@ void send_guest_pirq(struct domain *d, c { int port; struct evtchn *chn; @@ -160,7 +153,7 @@ /* * PV guests: It should not be possible to race with __evtchn_close(). 
The -@@ -837,7 +845,9 @@ void send_guest_pirq(struct domain *d, const struct pirq *pirq) +@@ -837,7 +845,9 @@ void send_guest_pirq(struct domain *d, c } chn = evtchn_from_port(d, port); @@ -186,7 +179,7 @@ return 0; } -@@ -1449,8 +1462,8 @@ static void domain_dump_evtchn_info(struct domain *d) +@@ -1449,8 +1462,8 @@ static void domain_dump_evtchn_info(stru printk(" %4u [%d/%d/", port, @@ -197,11 +190,9 @@ evtchn_port_print_state(d, chn); printk("]: s=%d n=%d x=%d", chn->state, chn->notify_vcpu_id, chn->xen_consumer); -diff --git a/xen/common/event_fifo.c b/xen/common/event_fifo.c -index 2f13d92128..68d0c7a632 100644 --- a/xen/common/event_fifo.c +++ b/xen/common/event_fifo.c -@@ -296,23 +296,26 @@ static void evtchn_fifo_unmask(struct domain *d, struct evtchn *evtchn) +@@ -296,23 +296,26 @@ static void evtchn_fifo_unmask(struct do evtchn_fifo_set_pending(v, evtchn); } @@ -234,11 +225,9 @@ return word && guest_test_bit(d, EVTCHN_FIFO_LINKED, word); } -diff --git a/xen/include/asm-x86/event.h b/xen/include/asm-x86/event.h -index 98a85233cb..5e09ede6d7 100644 --- a/xen/include/asm-x86/event.h +++ b/xen/include/asm-x86/event.h -@@ -47,4 +47,10 @@ static inline bool arch_virq_is_global(unsigned int virq) +@@ -47,4 +47,10 @@ static inline bool arch_virq_is_global(u return true; } @@ -249,11 +238,9 @@ +#endif + #endif -diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h -index e1b299e8df..bc9aa68650 100644 --- a/xen/include/xen/event.h +++ b/xen/include/xen/event.h -@@ -133,6 +133,24 @@ static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p) +@@ -133,6 +133,24 @@ static inline struct evtchn *evtchn_from return bucket_from_port(d, p) + (p % EVTCHNS_PER_BUCKET); } @@ -306,7 +293,7 @@ int (*set_priority)(struct domain *d, struct evtchn *evtchn, unsigned int priority); void (*print_state)(struct domain *d, const struct evtchn *evtchn); -@@ -193,38 +216,67 @@ static inline void evtchn_port_set_pending(struct domain *d, +@@ -193,38 +216,67 @@ static inline void evtchn_port_set_pendi unsigned int vcpu_id, struct evtchn *evtchn) { @@ -386,7 +373,7 @@ } static inline int evtchn_port_set_priority(struct domain *d, -@@ -233,6 +285,8 @@ static inline int evtchn_port_set_priority(struct domain *d, +@@ -233,6 +285,8 @@ static inline int evtchn_port_set_priori { if ( !d->evtchn_port_ops->set_priority ) return -ENOSYS; ++++++ 5f6a01a4-evtchn-preempt-in-evtchn_destroy.patch ++++++ # Commit 8fe7b5f9960f4d9ec46787af394d180c39c6b2db # Date 2020-09-22 15:52:36 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> evtchn: arrange for preemption in evtchn_destroy() Especially closing of fully established interdomain channels can take quite some time, due to the locking involved. Therefore we shouldn't assume we can clean up still active ports all in one go. Besides adding the necessary preemption check, also avoid pointlessly starting from (or now really ending at) 0; 1 is the lowest numbered port which may need closing. Since we're now reducing ->valid_evtchns, free_xen_event_channel(), and (at least to be on the safe side) notify_via_xen_event_channel() need to cope with attempts to close / unbind from / send through already closed (and no longer valid, as per port_is_valid()) ports. This is part of XSA-344. 
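The continuation pattern can be modelled in a few lines of standalone C; destroy_ports() and need_preempt() below are simplified stand-ins (no locking, no domain state), not the hypervisor code itself:

    #include <stdbool.h>
    #include <stdio.h>

    #ifndef ERESTART
    #define ERESTART 85            /* Xen/Linux "restart the call" error code */
    #endif

    /* Stand-in for hypercall_preempt_check(): preempt every 64 ports. */
    static bool need_preempt(unsigned int i)
    {
        return !(i & 0x3f);
    }

    /*
     * Close ports from the top down, recording progress in *valid so a
     * restarted call resumes where the previous one stopped.
     */
    static int destroy_ports(unsigned int *valid)
    {
        for ( unsigned int i = *valid; --i; )
        {
            /* closing of port i would go here */
            if ( need_preempt(i) )
            {
                *valid = i;        /* port i is closed; resume below it */
                return -ERESTART;
            }
        }
        return 0;
    }

    int main(void)
    {
        unsigned int valid = 200;
        int rc;

        while ( (rc = destroy_ports(&valid)) == -ERESTART )
            printf("preempted, %u ports still valid\n", valid);
        return rc;
    }

In this model the caller simply re-invokes the operation until it stops returning -ERESTART, which is what the hypercall continuation machinery does on the guest's behalf.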
Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Julien Grall <jgrall@amazon.com> Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -715,12 +715,14 @@ int domain_kill(struct domain *d) return domain_kill(d); d->is_dying = DOMDYING_dying; argo_destroy(d); - evtchn_destroy(d); gnttab_release_mappings(d); vnuma_destroy(d->vnuma); domain_set_outstanding_pages(d, 0); /* fallthrough */ case DOMDYING_dying: + rc = evtchn_destroy(d); + if ( rc ) + break; rc = domain_relinquish_resources(d); if ( rc != 0 ) break; --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c @@ -1297,7 +1297,16 @@ int alloc_unbound_xen_event_channel( void free_xen_event_channel(struct domain *d, int port) { - BUG_ON(!port_is_valid(d, port)); + if ( !port_is_valid(d, port) ) + { + /* + * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing + * with the spin_barrier() and BUG_ON() in evtchn_destroy(). + */ + smp_rmb(); + BUG_ON(!d->is_dying); + return; + } evtchn_close(d, port, 0); } @@ -1309,7 +1318,17 @@ void notify_via_xen_event_channel(struct struct domain *rd; unsigned long flags; - ASSERT(port_is_valid(ld, lport)); + if ( !port_is_valid(ld, lport) ) + { + /* + * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing + * with the spin_barrier() and BUG_ON() in evtchn_destroy(). + */ + smp_rmb(); + ASSERT(ld->is_dying); + return; + } + lchn = evtchn_from_port(ld, lport); spin_lock_irqsave(&lchn->lock, flags); @@ -1380,8 +1399,7 @@ int evtchn_init(struct domain *d, unsign return 0; } - -void evtchn_destroy(struct domain *d) +int evtchn_destroy(struct domain *d) { unsigned int i; @@ -1390,14 +1408,29 @@ void evtchn_destroy(struct domain *d) spin_barrier(&d->event_lock); /* Close all existing event channels. */ - for ( i = 0; port_is_valid(d, i); i++ ) + for ( i = d->valid_evtchns; --i; ) + { evtchn_close(d, i, 0); + /* + * Avoid preempting when called from domain_create()'s error path, + * and don't check too often (choice of frequency is arbitrary). + */ + if ( i && !(i & 0x3f) && d->is_dying != DOMDYING_dead && + hypercall_preempt_check() ) + { + write_atomic(&d->valid_evtchns, i); + return -ERESTART; + } + } + ASSERT(!d->active_evtchns); clear_global_virq_handlers(d); evtchn_fifo_destroy(d); + + return 0; } --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -138,7 +138,7 @@ struct evtchn } __attribute__((aligned(64))); int evtchn_init(struct domain *d, unsigned int max_port); -void evtchn_destroy(struct domain *d); /* from domain_kill */ +int evtchn_destroy(struct domain *d); /* from domain_kill */ void evtchn_destroy_final(struct domain *d); /* from complete_domain_destroy */ struct waitqueue_vcpu; ++++++ 5f6a01c6-evtchn-preempt-in-evtchn_reset.patch ++++++ # Commit 2785b2a9e04abc148e1c5259f4faee708ea356f4 # Date 2020-09-22 15:53:10 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> evtchn: arrange for preemption in evtchn_reset() Like for evtchn_destroy() looping over all possible event channels to close them can take a significant amount of time. Unlike done there, we can't alter domain properties (i.e. d->valid_evtchns) here. Borrow, in a lightweight form, the paging domctl continuation concept, redirecting the continuations to different sub-ops. Just like there this is to be able to allow for predictable overall results of the involved sub-ops: Racing requests should either complete or be refused. 
Note that a domain can't interfere with an already started (by a remote domain) reset, due to being paused. It can prevent a remote reset from happening by leaving a reset unfinished, but that's only going to affect itself. This is part of XSA-344. Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Julien Grall <jgrall@amazon.com> Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -1145,7 +1145,7 @@ void domain_unpause_except_self(struct d domain_unpause(d); } -int domain_soft_reset(struct domain *d) +int domain_soft_reset(struct domain *d, bool resuming) { struct vcpu *v; int rc; @@ -1159,7 +1159,7 @@ int domain_soft_reset(struct domain *d) } spin_unlock(&d->shutdown_lock); - rc = evtchn_reset(d); + rc = evtchn_reset(d, resuming); if ( rc ) return rc; --- a/xen/common/domctl.c +++ b/xen/common/domctl.c @@ -562,12 +562,22 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe } case XEN_DOMCTL_soft_reset: + case XEN_DOMCTL_soft_reset_cont: if ( d == current->domain ) /* no domain_pause() */ { ret = -EINVAL; break; } - ret = domain_soft_reset(d); + ret = domain_soft_reset(d, op->cmd == XEN_DOMCTL_soft_reset_cont); + if ( ret == -ERESTART ) + { + op->cmd = XEN_DOMCTL_soft_reset_cont; + if ( !__copy_field_to_guest(u_domctl, op, cmd) ) + ret = hypercall_create_continuation(__HYPERVISOR_domctl, + "h", u_domctl); + else + ret = -EFAULT; + } break; case XEN_DOMCTL_destroydomain: --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c @@ -1057,7 +1057,7 @@ int evtchn_unmask(unsigned int port) return 0; } -int evtchn_reset(struct domain *d) +int evtchn_reset(struct domain *d, bool resuming) { unsigned int i; int rc = 0; @@ -1065,11 +1065,40 @@ int evtchn_reset(struct domain *d) if ( d != current->domain && !d->controller_pause_count ) return -EINVAL; - for ( i = 0; port_is_valid(d, i); i++ ) + spin_lock(&d->event_lock); + + /* + * If we are resuming, then start where we stopped. Otherwise, check + * that a reset operation is not already in progress, and if none is, + * record that this is now the case. + */ + i = resuming ? d->next_evtchn : !d->next_evtchn; + if ( i > d->next_evtchn ) + d->next_evtchn = i; + + spin_unlock(&d->event_lock); + + if ( !i ) + return -EBUSY; + + for ( ; port_is_valid(d, i); i++ ) + { evtchn_close(d, i, 1); + /* NB: Choice of frequency is arbitrary. 
*/ + if ( !(i & 0x3f) && hypercall_preempt_check() ) + { + spin_lock(&d->event_lock); + d->next_evtchn = i; + spin_unlock(&d->event_lock); + return -ERESTART; + } + } + spin_lock(&d->event_lock); + d->next_evtchn = 0; + if ( d->active_evtchns > d->xen_evtchns ) rc = -EAGAIN; else if ( d->evtchn_fifo ) @@ -1204,7 +1233,8 @@ long do_event_channel_op(int cmd, XEN_GU break; } - case EVTCHNOP_reset: { + case EVTCHNOP_reset: + case EVTCHNOP_reset_cont: { struct evtchn_reset reset; struct domain *d; @@ -1217,9 +1247,13 @@ long do_event_channel_op(int cmd, XEN_GU rc = xsm_evtchn_reset(XSM_TARGET, current->domain, d); if ( !rc ) - rc = evtchn_reset(d); + rc = evtchn_reset(d, cmd == EVTCHNOP_reset_cont); rcu_unlock_domain(d); + + if ( rc == -ERESTART ) + rc = hypercall_create_continuation(__HYPERVISOR_event_channel_op, + "ih", EVTCHNOP_reset_cont, arg); break; } --- a/xen/include/public/domctl.h +++ b/xen/include/public/domctl.h @@ -1159,7 +1159,10 @@ struct xen_domctl { #define XEN_DOMCTL_iomem_permission 20 #define XEN_DOMCTL_ioport_permission 21 #define XEN_DOMCTL_hypercall_init 22 -#define XEN_DOMCTL_arch_setup 23 /* Obsolete IA64 only */ +#ifdef __XEN__ +/* #define XEN_DOMCTL_arch_setup 23 Obsolete IA64 only */ +#define XEN_DOMCTL_soft_reset_cont 23 +#endif #define XEN_DOMCTL_settimeoffset 24 #define XEN_DOMCTL_getvcpuaffinity 25 #define XEN_DOMCTL_real_mode_area 26 /* Obsolete PPC only */ --- a/xen/include/public/event_channel.h +++ b/xen/include/public/event_channel.h @@ -74,6 +74,9 @@ #define EVTCHNOP_init_control 11 #define EVTCHNOP_expand_array 12 #define EVTCHNOP_set_priority 13 +#ifdef __XEN__ +#define EVTCHNOP_reset_cont 14 +#endif /* ` } */ typedef uint32_t evtchn_port_t; --- a/xen/include/xen/event.h +++ b/xen/include/xen/event.h @@ -179,7 +179,7 @@ void evtchn_check_pollers(struct domain void evtchn_2l_init(struct domain *d); /* Close all event channels and reset to 2-level ABI. */ -int evtchn_reset(struct domain *d); +int evtchn_reset(struct domain *d, bool resuming); /* * Low-level event channel port ops. --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -371,6 +371,8 @@ struct domain * EVTCHNOP_reset). Read/write access like for active_evtchns. */ unsigned int xen_evtchns; + /* Port to resume from in evtchn_reset(), when in a continuation. */ + unsigned int next_evtchn; spinlock_t event_lock; const struct evtchn_port_ops *evtchn_port_ops; struct evtchn_fifo_domain *evtchn_fifo; @@ -663,7 +665,7 @@ int domain_kill(struct domain *d); int domain_shutdown(struct domain *d, u8 reason); void domain_resume(struct domain *d); -int domain_soft_reset(struct domain *d); +int domain_soft_reset(struct domain *d, bool resuming); int vcpu_start_shutdown_deferral(struct vcpu *v); void vcpu_end_shutdown_deferral(struct vcpu *v); ++++++ 5f6cfb5b-x86-PV-dont-GP-for-SYSENTER-with-NT-set.patch ++++++ # Commit 61d4a04349895edc5a5868274b906ba61ef24f47 # Date 2020-09-24 21:02:35 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/pv: Don't deliver #GP for a SYSENTER with NT set It is a matter of guest kernel policy what to do with offending userspace, and terminating said userspace may not be the action chosen. Linux explicitly tolerates this case. 
Reported-by: Andy Lutomirski <luto@kernel.org> Fixes: fdac951560 ("x86: clear EFLAGS.NT in SYSENTER entry path") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -313,7 +313,6 @@ UNLIKELY_START(nz, sysenter_nt_set) pushfq andl $~X86_EFLAGS_NT,(%rsp) popfq - xorl %eax,%eax UNLIKELY_END(sysenter_nt_set) testq %rax,%rax leal (,%rcx,TBF_INTERRUPT),%ecx ++++++ 5f6cfb5b-x86-PV-dont-clobber-NT-on-return-to-guest.patch ++++++ # Commit 5bcac985498ed83d89666959175ca9c9ed561ae1 # Date 2020-09-24 21:02:35 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/pv: Don't clobber NT on return-to-guest A 64bit IRET can restore NT - the faulting case is when NT is set in the live flags. This change had an unintended consequence of causing the NT flag to spontaneously disappear from guest context whenever a interrupt/exception occurred. In combination with a SYSENTER which sets both TF and NT, Xen's handling of the #DB exceptions clears NT before it is even recorded suitably in the guest kernel's view of what userspace was doing. Reported-by: Andy Lutomirski <luto@kernel.org> Fixes: 0e47f92b0 ("x86: force EFLAGS.IF on when exiting to PV guests") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/x86_64/compat/entry.S +++ b/xen/arch/x86/x86_64/compat/entry.S @@ -119,7 +119,7 @@ compat_process_trap: /* %rbx: struct vcpu, interrupts disabled */ ENTRY(compat_restore_all_guest) ASSERT_INTERRUPTS_DISABLED - mov $~(X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_VM),%r11d + mov $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), %r11d and UREGS_eflags(%rsp),%r11d .macro alt_cr4_pv32 --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -182,7 +182,7 @@ restore_all_guest: jz iret_exit_to_guest movq 24(%rsp),%r11 # RFLAGS - andq $~(X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_VM),%r11 + andq $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), %r11 orq $X86_EFLAGS_IF,%r11 /* Don't use SYSRET path if the return address is not canonical. */ @@ -213,7 +213,7 @@ restore_all_guest: movq 8(%rsp), %rcx # RIP /* No special register assumptions. */ iret_exit_to_guest: - andl $~(X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_VM),24(%rsp) + andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), 24(%rsp) orl $X86_EFLAGS_IF,24(%rsp) addq $8,%rsp .Lft0: iretq ++++++ 5f71a21e-x86-S3-fix-shadow-stack-resume.patch ++++++ # Commit 4bdbf746ac9152e70f264f87db4472707da805ce # Date 2020-09-28 10:43:10 +0200 # Author Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> # Committer Jan Beulich <jbeulich@suse.com> x86/S3: fix shadow stack resume path Fix the resume path to load the shadow stack pointer from saved_ssp (not saved_rsp), to match what suspend path does. Fixes: 633ecc4a7cb2 ("x86/S3: Save and restore Shadow Stack configuration") Backport: 4.14 Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/acpi/wakeup_prot.S +++ b/xen/arch/x86/acpi/wakeup_prot.S @@ -69,7 +69,7 @@ ENTRY(s3_resume) * so SETSSBSY will successfully load a value useful for us, then * reset MSR_PL0_SSP to its usual value and pop the temporary token. 
*/ - mov saved_rsp(%rip), %rdi + mov saved_ssp(%rip), %rdi cmpq $1, %rdi je .L_shstk_done ++++++ 5f76ca65-evtchn-Flask-prealloc-for-send.patch ++++++ # Commit 52e1fc47abc3a0123d2b5bb7e9172e84fd571851 # Date 2020-10-02 08:36:21 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> evtchn/Flask: pre-allocate node on send path xmalloc() & Co may not be called with IRQs off, or else check_lock() will have its assertion trigger about locks getting acquired inconsistently. Re-arranging the locking in evtchn_send() doesn't seem very reasonable, especially since the per-channel lock was introduced to avoid acquiring the per-domain event lock on the send paths. Issue a second call to xsm_evtchn_send() instead, before acquiring the lock, to give XSM / Flask a chance to pre-allocate whatever it may need. As these nodes are used merely for caching earlier decisions' results, allocate just one node in AVC code despite two potentially being needed. Things will merely be not as performant if a second allocation was wanted, just like when the pre-allocation fails. Fixes: c0ddc8634845 ("evtchn: convert per-channel lock to be IRQ-safe") Signed-off-by: Jan Beulich <jbeulich@suse.com> Tested-by: Jason Andryuk <jandryuk@gmail.com> Acked-by: Julien Grall <jgrall@amazon.com> Reviewed-by: Jason Andryuk <jandryuk@gmail.com> --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c @@ -723,6 +723,12 @@ int evtchn_send(struct domain *ld, unsig if ( !port_is_valid(ld, lport) ) return -EINVAL; + /* + * As the call further down needs to avoid allocations (due to running + * with IRQs off), give XSM a chance to pre-allocate if needed. + */ + xsm_evtchn_send(XSM_HOOK, ld, NULL); + lchn = evtchn_from_port(ld, lport); spin_lock_irqsave(&lchn->lock, flags); --- a/xen/include/xsm/xsm.h +++ b/xen/include/xsm/xsm.h @@ -59,6 +59,7 @@ struct xsm_operations { int (*evtchn_interdomain) (struct domain *d1, struct evtchn *chn1, struct domain *d2, struct evtchn *chn2); void (*evtchn_close_post) (struct evtchn *chn); + /* Note: Next hook may be called with 'chn' set to NULL. See call site. */ int (*evtchn_send) (struct domain *d, struct evtchn *chn); int (*evtchn_status) (struct domain *d, struct evtchn *chn); int (*evtchn_reset) (struct domain *d1, struct domain *d2); --- a/xen/xsm/flask/avc.c +++ b/xen/xsm/flask/avc.c @@ -24,7 +24,9 @@ #include <xen/prefetch.h> #include <xen/kernel.h> #include <xen/sched.h> +#include <xen/cpu.h> #include <xen/init.h> +#include <xen/percpu.h> #include <xen/rcupdate.h> #include <asm/atomic.h> #include <asm/current.h> @@ -341,17 +343,79 @@ static inline int avc_reclaim_node(void) return ecx; } +static struct avc_node *new_node(void) +{ + struct avc_node *node = xzalloc(struct avc_node); + + if ( node ) + { + INIT_RCU_HEAD(&node->rhead); + INIT_HLIST_NODE(&node->list); + avc_cache_stats_incr(allocations); + } + + return node; +} + +/* + * avc_has_perm_noaudit() may consume up to two nodes, which we may not be + * able to obtain from the allocator at that point. Since the is merely + * about caching earlier decisions, allow for (just) one pre-allocated node. 
+ */ +static DEFINE_PER_CPU(struct avc_node *, prealloc_node); + +void avc_prealloc(void) +{ + struct avc_node **prealloc = &this_cpu(prealloc_node); + + if ( !*prealloc ) + *prealloc = new_node(); +} + +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct avc_node **prealloc = &per_cpu(prealloc_node, cpu); + + if ( action == CPU_DEAD && *prealloc ) + { + xfree(*prealloc); + *prealloc = NULL; + avc_cache_stats_incr(frees); + } + + return NOTIFY_DONE; +} + +static struct notifier_block cpu_nfb = { + .notifier_call = cpu_callback, + .priority = 99 +}; + +static int __init cpu_nfb_init(void) +{ + register_cpu_notifier(&cpu_nfb); + return 0; +} +__initcall(cpu_nfb_init); + static struct avc_node *avc_alloc_node(void) { - struct avc_node *node; + struct avc_node *node, **prealloc = &this_cpu(prealloc_node); - node = xzalloc(struct avc_node); - if (!node) - goto out; - - INIT_RCU_HEAD(&node->rhead); - INIT_HLIST_NODE(&node->list); - avc_cache_stats_incr(allocations); + node = *prealloc; + *prealloc = NULL; + + if ( !node ) + { + /* Must not call xmalloc() & Co with IRQs off. */ + if ( !local_irq_is_enabled() ) + goto out; + node = new_node(); + if ( !node ) + goto out; + } atomic_inc(&avc_cache.active_nodes); if ( atomic_read(&avc_cache.active_nodes) > avc_cache_threshold ) --- a/xen/xsm/flask/hooks.c +++ b/xen/xsm/flask/hooks.c @@ -281,6 +281,16 @@ static int flask_evtchn_send(struct doma { int rc; + /* + * When called with non-NULL chn, memory allocation may not be permitted. + * Allow AVC to preallocate nodes as necessary upon early notification. + */ + if ( !chn ) + { + avc_prealloc(); + return 0; + } + switch ( chn->state ) { case ECS_INTERDOMAIN: --- a/xen/xsm/flask/include/avc.h +++ b/xen/xsm/flask/include/avc.h @@ -91,6 +91,8 @@ int avc_has_perm_noaudit(u32 ssid, u32 t int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct avc_audit_data *auditdata); +void avc_prealloc(void); + /* Exported to selinuxfs */ struct xen_flask_hash_stats; int avc_get_hash_stats(struct xen_flask_hash_stats *arg); ++++++ 5f76caaf-evtchn-FIFO-use-stable-fields.patch ++++++ # Commit 6f6f07b64cbe90e54f8e62b4d6f2404cf5306536 # Date 2020-10-02 08:37:35 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> evtchn/fifo: use stable fields when recording "last queue" information Both evtchn->priority and evtchn->notify_vcpu_id could change behind the back of evtchn_fifo_set_pending(), as for it - in the case of interdomain channels - only the remote side's per-channel lock is held. Neither the queue's priority nor the vCPU's vcpu_id fields have similar properties, so they seem better suited for the purpose. In particular they reflect the respective evtchn fields' values at the time they were used to determine queue and vCPU. Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Reviewed-by: Paul Durrant <paul@xen.org> --- a/xen/common/event_fifo.c +++ b/xen/common/event_fifo.c @@ -225,8 +225,8 @@ static void evtchn_fifo_set_pending(stru /* Moved to a different queue? 
*/ if ( old_q != q ) { - evtchn->last_vcpu_id = evtchn->notify_vcpu_id; - evtchn->last_priority = evtchn->priority; + evtchn->last_vcpu_id = v->vcpu_id; + evtchn->last_priority = q->priority; spin_unlock_irqrestore(&old_q->lock, flags); spin_lock_irqsave(&q->lock, flags); ++++++ 5f897c25-x86-traps-fix-read_registers-for-DF.patch ++++++ # Commit 6065a05adf152a556fb9f11a5218c89e41b62893 # Date 2020-10-16 11:55:33 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/traps: 'Fix' safety of read_registers() in #DF path All interrupts and exceptions pass a struct cpu_user_regs up into C. This contains the legacy vm86 fields from 32bit days, which are beyond the hardware-pushed frame. Accessing these fields is generally illegal, as they are logically out of bounds for anything other than an interrupt/exception hitting ring1/3 code. show_registers() unconditionally reads these fields, but the content is discarded before use. This is benign right now, as all parts of the stack are readable, including the guard pages. However, read_registers() in the #DF handler writes to these fields as part of preparing the state dump, and being IST, hits the adjacent stack frame. This has been broken forever, but c/s 6001660473 "x86/shstk: Rework the stack layout to support shadow stacks" repositioned the #DF stack to be adjacent to the guard page, which turns this OoB write into a fatal pagefault: (XEN) *** DOUBLE FAULT *** (XEN) ----[ Xen-4.15-unstable x86_64 debug=y Tainted: C ]---- (XEN) ----[ Xen-4.15-unstable x86_64 debug=y Tainted: C ]---- (XEN) CPU: 4 (XEN) RIP: e008:[<ffff82d04031fd4f>] traps.c#read_registers+0x29/0xc1 (XEN) RFLAGS: 0000000000050086 CONTEXT: hypervisor (d1v0) ... (XEN) Xen call trace: (XEN) [<ffff82d04031fd4f>] R traps.c#read_registers+0x29/0xc1 (XEN) [<ffff82d0403207b3>] F do_double_fault+0x3d/0x7e (XEN) [<ffff82d04039acd7>] F double_fault+0x107/0x110 (XEN) (XEN) Pagetable walk from ffff830236f6d008: (XEN) L4[0x106] = 80000000bfa9b063 ffffffffffffffff (XEN) L3[0x008] = 0000000236ffd063 ffffffffffffffff (XEN) L2[0x1b7] = 0000000236ffc063 ffffffffffffffff (XEN) L1[0x16d] = 8000000236f6d161 ffffffffffffffff (XEN) (XEN) **************************************** (XEN) Panic on CPU 4: (XEN) FATAL PAGE FAULT (XEN) [error_code=0003] (XEN) Faulting linear address: ffff830236f6d008 (XEN) **************************************** (XEN) and rendering the main #DF analysis broken. The proper fix is to delete cpu_user_regs.es and later, so no interrupt/exception path can access OoB, but this needs disentangling from the PV ABI first. Not-really-fixes: 6001660473 ("x86/shstk: Rework the stack layout to support shadow stacks") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/cpu/common.c +++ b/xen/arch/x86/cpu/common.c @@ -770,7 +770,13 @@ void load_system_tables(void) tss->ist[IST_MCE - 1] = stack_top + (1 + IST_MCE) * PAGE_SIZE; tss->ist[IST_NMI - 1] = stack_top + (1 + IST_NMI) * PAGE_SIZE; tss->ist[IST_DB - 1] = stack_top + (1 + IST_DB) * PAGE_SIZE; - tss->ist[IST_DF - 1] = stack_top + (1 + IST_DF) * PAGE_SIZE; + /* + * Gross bodge. The #DF handler uses the vm86 fields of cpu_user_regs + * beyond the hardware frame. Adjust the stack entrypoint so this + * doesn't manifest as an OoB write which hits the guard page. 
+ */ + tss->ist[IST_DF - 1] = stack_top + (1 + IST_DF) * PAGE_SIZE - + (sizeof(struct cpu_user_regs) - offsetof(struct cpu_user_regs, es)); tss->bitmap = IOBMP_INVALID_OFFSET; /* All other stack pointers poisioned. */ ++++++ 5f897c7b-x86-smpboot-restrict-memguard_guard_stack.patch ++++++ # Commit a7952a320c1e202a218702bfdb14f75132f04894 # Date 2020-10-16 11:56:59 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/smpboot: Don't unconditionally call memguard_guard_stack() in cpu_smpboot_alloc() cpu_smpboot_alloc() is designed to be idempotent with respect to partially initialised state. This occurs for S3 and CPU parking, where enough state to handle NMIs/#MCs needs to remain valid for the entire lifetime of Xen, even when we otherwise want to offline the CPU. For simplicity between various configuration, Xen always uses shadow stack mappings (Read-only + Dirty) for the guard page, irrespective of whether CET-SS is enabled. Unfortunately, the CET-SS changes in memguard_guard_stack() broke idempotency by first writing out the supervisor shadow stack tokens with plain writes, then changing the mapping to being read-only. This ordering is strictly necessary to configure the BSP, which cannot have the supervisor tokens be written with WRSS. Instead of calling memguard_guard_stack() unconditionally, call it only when actually allocating a new stack. Xenheap allocates are guaranteed to be writeable, and the net result is idempotency WRT configuring stack_base[]. Fixes: 91d26ed304f ("x86/shstk: Create shadow stacks") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/smpboot.c +++ b/xen/arch/x86/smpboot.c @@ -997,16 +997,18 @@ static int cpu_smpboot_alloc(unsigned in memflags = MEMF_node(node); if ( stack_base[cpu] == NULL ) + { stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags); - if ( stack_base[cpu] == NULL ) - goto out; + if ( !stack_base[cpu] ) + goto out; + + memguard_guard_stack(stack_base[cpu]); + } info = get_cpu_info_from_stack((unsigned long)stack_base[cpu]); info->processor_id = cpu; info->per_cpu_offset = __per_cpu_offset[cpu]; - memguard_guard_stack(stack_base[cpu]); - gdt = per_cpu(gdt, cpu) ?: alloc_xenheap_pages(0, memflags); if ( gdt == NULL ) goto out; ++++++ 5f8ed5d3-x86-mm-map_pages_to_xen-single-exit-path.patch ++++++ # Commit 08e6c6f80b018878476adc2c4e5679d2ce5cb4b1 # Date 2020-10-20 14:19:31 +0200 # Author Wei Liu <wei.liu2@citrix.com> # Committer Jan Beulich <jbeulich@suse.com> x86/mm: Refactor map_pages_to_xen to have only a single exit path We will soon need to perform clean-ups before returning. No functional change. This is part of XSA-345. 
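For illustration only (this is not Xen code; the function, names and allocations below are made up), the single-exit-path shape this refactor moves to looks like the following, with every failure funnelled through one label so that later patches can hang cleanup off it:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int map_example(size_t n)
{
    int rc = -ENOMEM;
    int *a = NULL, *b = NULL;

    a = malloc(n * sizeof(*a));
    if ( !a )
        goto out;                 /* previously: return -ENOMEM; */

    b = malloc(n * sizeof(*b));
    if ( !b )
        goto out;                 /* previously: return -ENOMEM; */

    /* ... the actual mapping work would go here ... */
    rc = 0;

 out:
    /*
     * Single exit point: a follow-up change can release a lock here and be
     * certain that every early return passes through it.
     */
    free(b);
    free(a);
    return rc;
}

int main(void)
{
    printf("map_example() returned %d\n", map_example(16));
    return 0;
}

Here the value is purely preparatory: the third XSA-345 patch below hangs the L3 table unlock off exactly this single exit path.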
Reported-by: Hongyan Xia <hongyxia@amazon.com> Signed-off-by: Wei Liu <wei.liu2@citrix.com> Signed-off-by: Hongyan Xia <hongyxia@amazon.com> Signed-off-by: George Dunlap <george.dunlap@citrix.com> Acked-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -5088,6 +5088,7 @@ int map_pages_to_xen( l2_pgentry_t *pl2e, ol2e; l1_pgentry_t *pl1e, ol1e; unsigned int i; + int rc = -ENOMEM; #define flush_flags(oldf) do { \ unsigned int o_ = (oldf); \ @@ -5108,7 +5109,8 @@ int map_pages_to_xen( l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt); if ( !pl3e ) - return -ENOMEM; + goto out; + ol3e = *pl3e; if ( cpu_has_page1gb && @@ -5198,7 +5200,7 @@ int map_pages_to_xen( l2t = alloc_xen_pagetable(); if ( l2t == NULL ) - return -ENOMEM; + goto out; for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) l2e_write(l2t + i, @@ -5227,7 +5229,7 @@ int map_pages_to_xen( pl2e = virt_to_xen_l2e(virt); if ( !pl2e ) - return -ENOMEM; + goto out; if ( ((((virt >> PAGE_SHIFT) | mfn_x(mfn)) & ((1u << PAGETABLE_ORDER) - 1)) == 0) && @@ -5271,7 +5273,7 @@ int map_pages_to_xen( { pl1e = virt_to_xen_l1e(virt); if ( pl1e == NULL ) - return -ENOMEM; + goto out; } else if ( l2e_get_flags(*pl2e) & _PAGE_PSE ) { @@ -5299,7 +5301,7 @@ int map_pages_to_xen( l1t = alloc_xen_pagetable(); if ( l1t == NULL ) - return -ENOMEM; + goto out; for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) l1e_write(&l1t[i], @@ -5445,7 +5447,10 @@ int map_pages_to_xen( #undef flush_flags - return 0; + rc = 0; + + out: + return rc; } int populate_pt_range(unsigned long virt, unsigned long nr_mfns) ++++++ 5f8ed5eb-x86-mm-modify_xen_mappings-one-exit-path.patch ++++++ # Commit b733f8a8b8db83f2d438cab3adb38b387cecfce0 # Date 2020-10-20 14:19:55 +0200 # Author Wei Liu <wei.liu2@citrix.com> # Committer Jan Beulich <jbeulich@suse.com> x86/mm: Refactor modify_xen_mappings to have one exit path We will soon need to perform clean-ups before returning. No functional change. This is part of XSA-345. Reported-by: Hongyan Xia <hongyxia@amazon.com> Signed-off-by: Wei Liu <wei.liu2@citrix.com> Signed-off-by: Hongyan Xia <hongyxia@amazon.com> Signed-off-by: George Dunlap <george.dunlap@citrix.com> Acked-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -5477,6 +5477,7 @@ int modify_xen_mappings(unsigned long s, l1_pgentry_t *pl1e; unsigned int i; unsigned long v = s; + int rc = -ENOMEM; /* Set of valid PTE bits which may be altered. */ #define FLAGS_MASK (_PAGE_NX|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_PRESENT) @@ -5520,7 +5521,8 @@ int modify_xen_mappings(unsigned long s, /* PAGE1GB: shatter the superpage and fall through. */ l2t = alloc_xen_pagetable(); if ( !l2t ) - return -ENOMEM; + goto out; + for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) l2e_write(l2t + i, l2e_from_pfn(l3e_get_pfn(*pl3e) + @@ -5577,7 +5579,8 @@ int modify_xen_mappings(unsigned long s, /* PSE: shatter the superpage and try again. 
*/ l1t = alloc_xen_pagetable(); if ( !l1t ) - return -ENOMEM; + goto out; + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) l1e_write(&l1t[i], l1e_from_pfn(l2e_get_pfn(*pl2e) + i, @@ -5710,7 +5713,10 @@ int modify_xen_mappings(unsigned long s, flush_area(NULL, FLUSH_TLB_GLOBAL); #undef FLAGS_MASK - return 0; + rc = 0; + + out: + return rc; } #undef flush_area ++++++ 5f8ed603-x86-mm-prevent-races-in-mapping-updates.patch ++++++ # Commit 1ce75e99d75907aaffae05fcf658a833802bce49 # Date 2020-10-20 14:20:19 +0200 # Author Hongyan Xia <hongyxia@amazon.com> # Committer Jan Beulich <jbeulich@suse.com> x86/mm: Prevent some races in hypervisor mapping updates map_pages_to_xen will attempt to coalesce mappings into 2MiB and 1GiB superpages if possible, to maximize TLB efficiency. This means both replacing superpage entries with smaller entries, and replacing smaller entries with superpages. Unfortunately, while some potential races are handled correctly, others are not. These include: 1. When one processor modifies a sub-superpage mapping while another processor replaces the entire range with a superpage. Take the following example: Suppose L3[N] points to L2. And suppose we have two processors, A and B. * A walks the pagetables, get a pointer to L2. * B replaces L3[N] with a 1GiB mapping. * B Frees L2 * A writes L2[M] # This is race exacerbated by the fact that virt_to_xen_l[21]e doesn't handle higher-level superpages properly: If you call virt_xen_to_l2e on a virtual address within an L3 superpage, you'll either hit a BUG() (most likely), or get a pointer into the middle of a data page; same with virt_xen_to_l1 on a virtual address within either an L3 or L2 superpage. So take the following example: * A reads pl3e and discovers it to point to an L2. * B replaces L3[N] with a 1GiB mapping * A calls virt_to_xen_l2e() and hits the BUG_ON() # 2. When two processors simultaneously try to replace a sub-superpage mapping with a superpage mapping. Take the following example: Suppose L3[N] points to L2. And suppose we have two processors, A and B, both trying to replace L3[N] with a superpage. * A walks the pagetables, get a pointer to pl3e, and takes a copy ol3e pointing to L2. * B walks the pagetables, gets a pointre to pl3e, and takes a copy ol3e pointing to L2. * A writes the new value into L3[N] * B writes the new value into L3[N] * A recursively frees all the L1's under L2, then frees L2 * B recursively double-frees all the L1's under L2, then double-frees L2 # Fix this by grabbing a lock for the entirety of the mapping update operation. Rather than grabbing map_pgdir_lock for the entire operation, however, repurpose the PGT_locked bit from L3's page->type_info as a lock. This means that rather than locking the entire address space, we "only" lock a single 512GiB chunk of hypervisor address space at a time. There was a proposal for a lock-and-reverify approach, where we walk the pagetables to the point where we decide what to do; then grab the map_pgdir_lock, re-verify the information we collected without the lock, and finally make the change (starting over again if anything had changed). Without being able to guarantee that the L2 table wasn't freed, however, that means every read would need to be considered potentially unsafe. Thinking carefully about that is probably something that wants to be done on public, not under time pressure. This is part of XSA-345. 
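As a rough stand-alone sketch of the locking primitive described above (not the actual Xen implementation: the real code operates on page->u.inuse.type_info with Xen's cmpxchg() and cpu_relax(), while the bit value and GCC builtins below are assumptions made for a user-space demo):

#include <assert.h>
#include <stdio.h>

#define PGT_locked (1UL << 0)        /* assumed bit position, demo only */

struct page_info {
    unsigned long type_info;         /* stands in for page->u.inuse.type_info */
};

static void l3t_lock(struct page_info *page)
{
    unsigned long x, nx;

    do {
        while ( (x = __atomic_load_n(&page->type_info, __ATOMIC_RELAXED)) &
                PGT_locked )
            ;                        /* cpu_relax() in the hypervisor */
        nx = x | PGT_locked;
    } while ( __sync_val_compare_and_swap(&page->type_info, x, nx) != x );
}

static void l3t_unlock(struct page_info *page)
{
    unsigned long x, nx, y = page->type_info;

    do {
        x = y;
        assert(x & PGT_locked);      /* BUG_ON() in the hypervisor */
        nx = x & ~PGT_locked;
    } while ( (y = __sync_val_compare_and_swap(&page->type_info, x, nx)) != x );
}

int main(void)
{
    struct page_info pg = { .type_info = 0x40 };

    l3t_lock(&pg);
    /* ... the L3 table backed by this page may be modified here ... */
    l3t_unlock(&pg);

    printf("type_info restored to %#lx\n", pg.type_info);
    return 0;
}

The real lock additionally documents ordering constraints against page_lock() and map_pgdir_lock, which this toy does not model.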
Reported-by: Hongyan Xia <hongyxia@amazon.com> Signed-off-by: Hongyan Xia <hongyxia@amazon.com> Signed-off-by: George Dunlap <george.dunlap@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -2088,6 +2088,50 @@ void page_unlock(struct page_info *page) current_locked_page_set(NULL); } +/* + * L3 table locks: + * + * Used for serialization in map_pages_to_xen() and modify_xen_mappings(). + * + * For Xen PT pages, the page->u.inuse.type_info is unused and it is safe to + * reuse the PGT_locked flag. This lock is taken only when we move down to L3 + * tables and below, since L4 (and above, for 5-level paging) is still globally + * protected by map_pgdir_lock. + * + * PV MMU update hypercalls call map_pages_to_xen while holding a page's page_lock(). + * This has two implications: + * - We cannot reuse reuse current_locked_page_* for debugging + * - To avoid the chance of deadlock, even for different pages, we + * must never grab page_lock() after grabbing l3t_lock(). This + * includes any page_lock()-based locks, such as + * mem_sharing_page_lock(). + * + * Also note that we grab the map_pgdir_lock while holding the + * l3t_lock(), so to avoid deadlock we must avoid grabbing them in + * reverse order. + */ +static void l3t_lock(struct page_info *page) +{ + unsigned long x, nx; + + do { + while ( (x = page->u.inuse.type_info) & PGT_locked ) + cpu_relax(); + nx = x | PGT_locked; + } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x ); +} + +static void l3t_unlock(struct page_info *page) +{ + unsigned long x, nx, y = page->u.inuse.type_info; + + do { + x = y; + BUG_ON(!(x & PGT_locked)); + nx = x & ~PGT_locked; + } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x ); +} + #ifdef CONFIG_PV /* * PTE flags that a guest may change without re-validating the PTE. @@ -5078,6 +5122,23 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned l flush_area_local((const void *)v, f) : \ flush_area_all((const void *)v, f)) +#define L3T_INIT(page) (page) = ZERO_BLOCK_PTR + +#define L3T_LOCK(page) \ + do { \ + if ( locking ) \ + l3t_lock(page); \ + } while ( false ) + +#define L3T_UNLOCK(page) \ + do { \ + if ( locking && (page) != ZERO_BLOCK_PTR ) \ + { \ + l3t_unlock(page); \ + (page) = ZERO_BLOCK_PTR; \ + } \ + } while ( false ) + int map_pages_to_xen( unsigned long virt, mfn_t mfn, @@ -5089,6 +5150,7 @@ int map_pages_to_xen( l1_pgentry_t *pl1e, ol1e; unsigned int i; int rc = -ENOMEM; + struct page_info *current_l3page; #define flush_flags(oldf) do { \ unsigned int o_ = (oldf); \ @@ -5104,13 +5166,20 @@ int map_pages_to_xen( } \ } while (0) + L3T_INIT(current_l3page); + while ( nr_mfns != 0 ) { - l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt); + l3_pgentry_t *pl3e, ol3e; + L3T_UNLOCK(current_l3page); + + pl3e = virt_to_xen_l3e(virt); if ( !pl3e ) goto out; + current_l3page = virt_to_page(pl3e); + L3T_LOCK(current_l3page); ol3e = *pl3e; if ( cpu_has_page1gb && @@ -5450,6 +5519,7 @@ int map_pages_to_xen( rc = 0; out: + L3T_UNLOCK(current_l3page); return rc; } @@ -5478,6 +5548,7 @@ int modify_xen_mappings(unsigned long s, unsigned int i; unsigned long v = s; int rc = -ENOMEM; + struct page_info *current_l3page; /* Set of valid PTE bits which may be altered. 
*/ #define FLAGS_MASK (_PAGE_NX|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_PRESENT) @@ -5486,11 +5557,22 @@ int modify_xen_mappings(unsigned long s, ASSERT(IS_ALIGNED(s, PAGE_SIZE)); ASSERT(IS_ALIGNED(e, PAGE_SIZE)); + L3T_INIT(current_l3page); + while ( v < e ) { - l3_pgentry_t *pl3e = virt_to_xen_l3e(v); + l3_pgentry_t *pl3e; - if ( !pl3e || !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) + L3T_UNLOCK(current_l3page); + + pl3e = virt_to_xen_l3e(v); + if ( !pl3e ) + goto out; + + current_l3page = virt_to_page(pl3e); + L3T_LOCK(current_l3page); + + if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) { /* Confirm the caller isn't trying to create new mappings. */ ASSERT(!(nf & _PAGE_PRESENT)); @@ -5716,9 +5798,13 @@ int modify_xen_mappings(unsigned long s, rc = 0; out: + L3T_UNLOCK(current_l3page); return rc; } +#undef L3T_LOCK +#undef L3T_UNLOCK + #undef flush_area int destroy_xen_mappings(unsigned long s, unsigned long e) ++++++ 5f8ed635-IOMMU-suppress-iommu_dont_flush_iotlb-when.patch ++++++ # Commit dea460d86957bf1425a8a1572626099ac3f165a8 # Date 2020-10-20 14:21:09 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> IOMMU: suppress "iommu_dont_flush_iotlb" when about to free a page Deferring flushes to a single, wide range one - as is done when handling XENMAPSPACE_gmfn_range - is okay only as long as pages don't get freed ahead of the eventual flush. While the only function setting the flag (xenmem_add_to_physmap()) suggests by its name that it's only mapping new entries, in reality the way xenmem_add_to_physmap_one() works means an unmap would happen not only for the page being moved (but not freed) but, if the destination GFN is populated, also for the page being displaced from that GFN. Collapsing the two flushes for this GFN into just one (end even more so deferring it to a batched invocation) is not correct. This is part of XSA-346. Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ") Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Paul Durrant <paul@xen.org> Acked-by: Julien Grall <jgrall@amazon.com> --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -293,6 +293,7 @@ int guest_remove_page(struct domain *d, p2m_type_t p2mt; #endif mfn_t mfn; + bool *dont_flush_p, dont_flush; int rc; #ifdef CONFIG_X86 @@ -379,8 +380,18 @@ int guest_remove_page(struct domain *d, return -ENXIO; } + /* + * Since we're likely to free the page below, we need to suspend + * xenmem_add_to_physmap()'s suppressing of IOMMU TLB flushes. + */ + dont_flush_p = &this_cpu(iommu_dont_flush_iotlb); + dont_flush = *dont_flush_p; + *dont_flush_p = false; + rc = guest_physmap_remove_page(d, _gfn(gmfn), mfn, 0); + *dont_flush_p = dont_flush; + /* * With the lack of an IOMMU on some platforms, domains with DMA-capable * device must retrieve the same pfn when the hypercall populate_physmap ++++++ 5f8ed64c-IOMMU-hold-page-ref-until-TLB-flush.patch ++++++ # Commit 5777a3742d88ff1c0ebc626ceb4fd47f9b3dc6d5 # Date 2020-10-20 14:21:32 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> IOMMU: hold page ref until after deferred TLB flush When moving around a page via XENMAPSPACE_gmfn_range, deferring the TLB flush for the "from" GFN range requires that the page remains allocated to the guest until the TLB flush has actually occurred. Otherwise a parallel hypercall to remove the page would only flush the TLB for the GFN it has been moved to, but not the one is was mapped at originally. 
This is part of XSA-346. Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ") Reported-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Julien Grall <jgrall@amazon.com> --- a/xen/arch/arm/mm.c +++ b/xen/arch/arm/mm.c @@ -1407,7 +1407,7 @@ void share_xen_page_with_guest(struct pa int xenmem_add_to_physmap_one( struct domain *d, unsigned int space, - union xen_add_to_physmap_batch_extra extra, + union add_to_physmap_extra extra, unsigned long idx, gfn_t gfn) { @@ -1480,10 +1480,6 @@ int xenmem_add_to_physmap_one( break; } case XENMAPSPACE_dev_mmio: - /* extra should be 0. Reserved for future use. */ - if ( extra.res0 ) - return -EOPNOTSUPP; - rc = map_dev_mmio_region(d, gfn, 1, _mfn(idx)); return rc; --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -4550,7 +4550,7 @@ static int handle_iomem_range(unsigned l int xenmem_add_to_physmap_one( struct domain *d, unsigned int space, - union xen_add_to_physmap_batch_extra extra, + union add_to_physmap_extra extra, unsigned long idx, gfn_t gpfn) { @@ -4634,9 +4634,20 @@ int xenmem_add_to_physmap_one( rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K); put_both: - /* In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. */ + /* + * In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. + * We also may need to transfer ownership of the page reference to our + * caller. + */ if ( space == XENMAPSPACE_gmfn ) + { put_gfn(d, gfn); + if ( !rc && extra.ppage ) + { + *extra.ppage = page; + page = NULL; + } + } if ( page ) put_page(page); --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -815,13 +815,12 @@ int xenmem_add_to_physmap(struct domain { unsigned int done = 0; long rc = 0; - union xen_add_to_physmap_batch_extra extra; + union add_to_physmap_extra extra = {}; + struct page_info *pages[16]; ASSERT(paging_mode_translate(d)); - if ( xatp->space != XENMAPSPACE_gmfn_foreign ) - extra.res0 = 0; - else + if ( xatp->space == XENMAPSPACE_gmfn_foreign ) extra.foreign_domid = DOMID_INVALID; if ( xatp->space != XENMAPSPACE_gmfn_range ) @@ -836,7 +835,10 @@ int xenmem_add_to_physmap(struct domain xatp->size -= start; if ( is_iommu_enabled(d) ) + { this_cpu(iommu_dont_flush_iotlb) = 1; + extra.ppage = &pages[0]; + } while ( xatp->size > done ) { @@ -848,8 +850,12 @@ int xenmem_add_to_physmap(struct domain xatp->idx++; xatp->gpfn++; + if ( extra.ppage ) + ++extra.ppage; + /* Check for continuation if it's not the last iteration. */ - if ( xatp->size > ++done && hypercall_preempt_check() ) + if ( (++done > ARRAY_SIZE(pages) && extra.ppage) || + (xatp->size > done && hypercall_preempt_check()) ) { rc = start + done; break; @@ -859,6 +865,7 @@ int xenmem_add_to_physmap(struct domain if ( is_iommu_enabled(d) ) { int ret; + unsigned int i; this_cpu(iommu_dont_flush_iotlb) = 0; @@ -867,6 +874,15 @@ int xenmem_add_to_physmap(struct domain if ( unlikely(ret) && rc >= 0 ) rc = ret; + /* + * Now that the IOMMU TLB flush was done for the original GFN, drop + * the page references. The 2nd flush below is fine to make later, as + * whoever removes the page again from its new GFN will have to do + * another flush anyway. 
+ */ + for ( i = 0; i < done; ++i ) + put_page(pages[i]); + ret = iommu_iotlb_flush(d, _dfn(xatp->gpfn - done), done, IOMMU_FLUSHF_added | IOMMU_FLUSHF_modified); if ( unlikely(ret) && rc >= 0 ) @@ -880,6 +896,8 @@ static int xenmem_add_to_physmap_batch(s struct xen_add_to_physmap_batch *xatpb, unsigned int extent) { + union add_to_physmap_extra extra = {}; + if ( unlikely(xatpb->size < extent) ) return -EILSEQ; @@ -891,6 +909,19 @@ static int xenmem_add_to_physmap_batch(s !guest_handle_subrange_okay(xatpb->errs, extent, xatpb->size - 1) ) return -EFAULT; + switch ( xatpb->space ) + { + case XENMAPSPACE_dev_mmio: + /* res0 is reserved for future use. */ + if ( xatpb->u.res0 ) + return -EOPNOTSUPP; + break; + + case XENMAPSPACE_gmfn_foreign: + extra.foreign_domid = xatpb->u.foreign_domid; + break; + } + while ( xatpb->size > extent ) { xen_ulong_t idx; @@ -903,8 +934,7 @@ static int xenmem_add_to_physmap_batch(s extent, 1)) ) return -EFAULT; - rc = xenmem_add_to_physmap_one(d, xatpb->space, - xatpb->u, + rc = xenmem_add_to_physmap_one(d, xatpb->space, extra, idx, _gfn(gpfn)); if ( unlikely(__copy_to_guest_offset(xatpb->errs, extent, &rc, 1)) ) --- a/xen/include/xen/mm.h +++ b/xen/include/xen/mm.h @@ -592,8 +592,22 @@ void scrub_one_page(struct page_info *); page_list_del(pg, page_to_list(d, pg)) #endif +union add_to_physmap_extra { + /* + * XENMAPSPACE_gmfn: When deferring TLB flushes, a page reference needs + * to be kept until after the flush, so the page can't get removed from + * the domain (and re-used for another purpose) beforehand. By passing + * non-NULL, the caller of xenmem_add_to_physmap_one() indicates it wants + * to have ownership of such a reference transferred in the success case. + */ + struct page_info **ppage; + + /* XENMAPSPACE_gmfn_foreign */ + domid_t foreign_domid; +}; + int xenmem_add_to_physmap_one(struct domain *d, unsigned int space, - union xen_add_to_physmap_batch_extra extra, + union add_to_physmap_extra extra, unsigned long idx, gfn_t gfn); int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp, ++++++ 5f8ed682-AMD-IOMMU-convert-amd_iommu_pte.patch ++++++ # Commit 73f62c7380edf07469581a3049aba98abd63b275 # Date 2020-10-20 14:22:26 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> AMD/IOMMU: convert amd_iommu_pte from struct to union This is to add a "raw" counterpart to the bitfield equivalent. Take the opportunity and - convert fields to bool / unsigned int, - drop the naming of the reserved field, - shorten the names of the ignored ones. This is part of XSA-347. 
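The point of adding the "raw" member is that a whole entry can then be read or written with a single access, which the follow-up patch relies on. A cut-down, stand-alone sketch (field names and widths are illustrative, not the real PTE layout; __atomic_store_n stands in here for Xen's write_atomic()):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Cut-down stand-in for the PTE; field widths are illustrative only. */
union example_pte {
    uint64_t raw;                    /* whole-entry view, for single accesses */
    struct {
        bool pr:1;
        unsigned int next_level:3;
        uint64_t mfn:40;
        bool ir:1;
        bool iw:1;
    };
};

int main(void)
{
    union example_pte pte = { .raw = 0 }, new = {};

    /* Build the new entry fully in a local variable ... */
    new.mfn = 0x12345;
    new.ir = new.iw = true;
    new.next_level = 0;
    new.pr = true;

    /* ... then publish it with one store of the raw view. */
    __atomic_store_n(&pte.raw, new.raw, __ATOMIC_RELAXED);

    printf("raw=%#llx pr=%d mfn=%#llx\n", (unsigned long long)pte.raw,
           pte.pr, (unsigned long long)(uint64_t)pte.mfn);
    return 0;
}

The next patch ("update live PTEs atomically") is what actually exploits the raw view: it constructs the new entry in a local union and writes it with one write_atomic(), instead of updating the live bitfields one by one.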
Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Paul Durrant <paul@xen.org> --- a/xen/drivers/passthrough/amd/iommu-defs.h +++ b/xen/drivers/passthrough/amd/iommu-defs.h @@ -451,20 +451,23 @@ union amd_iommu_x2apic_control { #define IOMMU_PAGE_TABLE_U32_PER_ENTRY (IOMMU_PAGE_TABLE_ENTRY_SIZE / 4) #define IOMMU_PAGE_TABLE_ALIGNMENT 4096 -struct amd_iommu_pte { - uint64_t pr:1; - uint64_t ignored0:4; - uint64_t a:1; - uint64_t d:1; - uint64_t ignored1:2; - uint64_t next_level:3; - uint64_t mfn:40; - uint64_t reserved:7; - uint64_t u:1; - uint64_t fc:1; - uint64_t ir:1; - uint64_t iw:1; - uint64_t ignored2:1; +union amd_iommu_pte { + uint64_t raw; + struct { + bool pr:1; + unsigned int ign0:4; + bool a:1; + bool d:1; + unsigned int ign1:2; + unsigned int next_level:3; + uint64_t mfn:40; + unsigned int :7; + bool u:1; + bool fc:1; + bool ir:1; + bool iw:1; + unsigned int ign2:1; + }; }; /* Paging modes */ --- a/xen/drivers/passthrough/amd/iommu_map.c +++ b/xen/drivers/passthrough/amd/iommu_map.c @@ -34,7 +34,7 @@ static unsigned int pfn_to_pde_idx(unsig static unsigned int clear_iommu_pte_present(unsigned long l1_mfn, unsigned long dfn) { - struct amd_iommu_pte *table, *pte; + union amd_iommu_pte *table, *pte; unsigned int flush_flags; table = map_domain_page(_mfn(l1_mfn)); @@ -48,7 +48,7 @@ static unsigned int clear_iommu_pte_pres return flush_flags; } -static unsigned int set_iommu_pde_present(struct amd_iommu_pte *pte, +static unsigned int set_iommu_pde_present(union amd_iommu_pte *pte, unsigned long next_mfn, unsigned int next_level, bool iw, bool ir) @@ -83,7 +83,7 @@ static unsigned int set_iommu_pte_presen int pde_level, bool iw, bool ir) { - struct amd_iommu_pte *table, *pde; + union amd_iommu_pte *table, *pde; unsigned int flush_flags; table = map_domain_page(_mfn(pt_mfn)); @@ -174,7 +174,7 @@ void iommu_dte_set_guest_cr3(struct amd_ static int iommu_pde_from_dfn(struct domain *d, unsigned long dfn, unsigned long pt_mfn[], bool map) { - struct amd_iommu_pte *pde, *next_table_vaddr; + union amd_iommu_pte *pde, *next_table_vaddr; unsigned long next_table_mfn; unsigned int level; struct page_info *table; @@ -448,7 +448,7 @@ int __init amd_iommu_quarantine_init(str unsigned long end_gfn = 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT); unsigned int level = amd_iommu_get_paging_mode(end_gfn); - struct amd_iommu_pte *table; + union amd_iommu_pte *table; if ( hd->arch.root_table ) { @@ -479,7 +479,7 @@ int __init amd_iommu_quarantine_init(str for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ ) { - struct amd_iommu_pte *pde = &table[i]; + union amd_iommu_pte *pde = &table[i]; /* * PDEs are essentially a subset of PTEs, so this function --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c @@ -387,7 +387,7 @@ static void deallocate_next_page_table(s static void deallocate_page_table(struct page_info *pg) { - struct amd_iommu_pte *table_vaddr; + union amd_iommu_pte *table_vaddr; unsigned int index, level = PFN_ORDER(pg); PFN_ORDER(pg) = 0; @@ -402,7 +402,7 @@ static void deallocate_page_table(struct for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ ) { - struct amd_iommu_pte *pde = &table_vaddr[index]; + union amd_iommu_pte *pde = &table_vaddr[index]; if ( pde->mfn && pde->next_level && pde->pr ) { @@ -554,7 +554,7 @@ static void amd_dump_p2m_table_level(str paddr_t gpa, int indent) { paddr_t address; - struct amd_iommu_pte *table_vaddr; + const union amd_iommu_pte *table_vaddr; int 
index; if ( level < 1 ) @@ -570,7 +570,7 @@ static void amd_dump_p2m_table_level(str for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ ) { - struct amd_iommu_pte *pde = &table_vaddr[index]; + const union amd_iommu_pte *pde = &table_vaddr[index]; if ( !(index % 2) ) process_pending_softirqs(); ++++++ 5f8ed69c-AMD-IOMMU-update-live-PTEs-atomically.patch ++++++ # Commit 3b055121c5410e2c3105d6d06aa24ca0d58868cd # Date 2020-10-20 14:22:52 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> AMD/IOMMU: update live PTEs atomically Updating a live PTE bitfield by bitfield risks the compiler re-ordering the individual updates as well as splitting individual updates into multiple memory writes. Construct the new entry fully in a local variable, do the check to determine the flushing needs on the thus established new entry, and then write the new entry by a single insn. Similarly using memset() to clear a PTE is unsafe, as the order of writes the function does is, at least in principle, undefined. This is part of XSA-347. Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Paul Durrant <paul@xen.org> --- a/xen/drivers/passthrough/amd/iommu_map.c +++ b/xen/drivers/passthrough/amd/iommu_map.c @@ -41,7 +41,7 @@ static unsigned int clear_iommu_pte_pres pte = &table[pfn_to_pde_idx(dfn, 1)]; flush_flags = pte->pr ? IOMMU_FLUSHF_modified : 0; - memset(pte, 0, sizeof(*pte)); + write_atomic(&pte->raw, 0); unmap_domain_page(table); @@ -53,26 +53,30 @@ static unsigned int set_iommu_pde_presen unsigned int next_level, bool iw, bool ir) { + union amd_iommu_pte new = {}, old; unsigned int flush_flags = IOMMU_FLUSHF_added; - if ( pte->pr && - (pte->mfn != next_mfn || - pte->iw != iw || - pte->ir != ir || - pte->next_level != next_level) ) - flush_flags |= IOMMU_FLUSHF_modified; - /* * FC bit should be enabled in PTE, this helps to solve potential * issues with ATS devices */ - pte->fc = !next_level; + new.fc = !next_level; + + new.mfn = next_mfn; + new.iw = iw; + new.ir = ir; + new.next_level = next_level; + new.pr = true; + + old.raw = read_atomic(&pte->raw); + old.ign0 = 0; + old.ign1 = 0; + old.ign2 = 0; + + if ( old.pr && old.raw != new.raw ) + flush_flags |= IOMMU_FLUSHF_modified; - pte->mfn = next_mfn; - pte->iw = iw; - pte->ir = ir; - pte->next_level = next_level; - pte->pr = 1; + write_atomic(&pte->raw, new.raw); return flush_flags; } ++++++ 5f8ed6b0-AMD-IOMMU-suitably-order-DTE-mods.patch ++++++ # Commit 0514a3a25fb9ebff5d75cc8f00a9229385300858 # Date 2020-10-20 14:23:12 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> AMD/IOMMU: ensure suitable ordering of DTE modifications DMA and interrupt translation should be enabled only after other applicable DTE fields have been written. Similarly when disabling translation or when moving a device between domains, translation should first be disabled, before other entry fields get modified. Note however that the "moving" aspect doesn't apply to the interrupt remapping side, as domain specifics are maintained in the IRTEs here, not the DTE. We also never disable interrupt remapping once it got enabled for a device (the respective argument passed is always the immutable iommu_intremap). This is part of XSA-347. 
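The ordering requirement reduces to: disable a live entry before editing it, and make all other fields visible before (re)enabling it. A hedged, user-space approximation follows (the real DTE has many more fields and uses Xen's smp_wmb() rather than a C11 fence; the structure below is invented for the demo):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Cut-down device table entry; the real DTE has many more fields. */
struct example_dte {
    bool v;                          /* DMA translation enabled */
    uint16_t domain_id;
    uint64_t pt_root;
    uint8_t paging_mode;
};

/* Stand-in for Xen's smp_wmb(): order our stores for other observers. */
#define wmb() __atomic_thread_fence(__ATOMIC_RELEASE)

static void set_root_page_table(struct example_dte *dte, uint64_t root,
                                uint16_t domid, uint8_t mode, bool valid)
{
    /* A live entry is disabled first, before any other field changes. */
    if ( dte->v )
    {
        dte->v = false;
        wmb();
    }

    dte->domain_id = domid;
    dte->pt_root = root;
    dte->paging_mode = mode;
    wmb();                           /* fields visible before (re)enabling */

    dte->v = valid;
}

int main(void)
{
    struct example_dte dte = { .v = true };

    set_root_page_table(&dte, 0xabcd000, 1, 3, true);
    printf("v=%d dom=%u root=%#llx mode=%d\n", dte.v,
           (unsigned int)dte.domain_id, (unsigned long long)dte.pt_root,
           dte.paging_mode);
    return 0;
}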
Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Paul Durrant <paul@xen.org> --- a/xen/drivers/passthrough/amd/iommu_map.c +++ b/xen/drivers/passthrough/amd/iommu_map.c @@ -103,11 +103,18 @@ void amd_iommu_set_root_page_table(struc uint64_t root_ptr, uint16_t domain_id, uint8_t paging_mode, bool valid) { + if ( valid || dte->v ) + { + dte->tv = false; + dte->v = true; + smp_wmb(); + } dte->domain_id = domain_id; dte->pt_root = paddr_to_pfn(root_ptr); dte->iw = true; dte->ir = true; dte->paging_mode = paging_mode; + smp_wmb(); dte->tv = true; dte->v = valid; } @@ -130,6 +137,7 @@ void amd_iommu_set_intremap_table( } dte->ig = false; /* unmapped interrupts result in i/o page faults */ + smp_wmb(); dte->iv = valid; } --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c @@ -117,7 +117,10 @@ static void amd_iommu_setup_domain_devic /* Undo what amd_iommu_disable_domain_device() may have done. */ ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id]; if ( dte->it_root ) + { dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED; + smp_wmb(); + } dte->iv = iommu_intremap; dte->ex = ivrs_dev->dte_allow_exclusion; dte->sys_mgt = MASK_EXTR(ivrs_dev->device_flags, ACPI_IVHD_SYSTEM_MGMT); ++++++ libxc.sr.superpage.patch ++++++ --- /var/tmp/diff_new_pack.AogIHC/_old 2020-11-02 09:38:45.557515843 +0100 +++ /var/tmp/diff_new_pack.AogIHC/_new 2020-11-02 09:38:45.557515843 +0100 @@ -482,7 +482,7 @@ free(ctx->x86.restore.cpuid.ptr); free(ctx->x86.restore.msr.ptr); -@@ -249,6 +277,318 @@ static int x86_hvm_cleanup(struct xc_sr_ +@@ -249,6 +277,322 @@ static int x86_hvm_cleanup(struct xc_sr_ return 0; } @@ -601,8 +601,10 @@ + sp->count = 1ULL << order; + + /* Allocate only if there is room for another page */ -+ if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages ) ++ if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages ) { ++ errno = E2BIG; + return false; ++ } + + extent = sp->base_pfn = (sp->pfn >> order) << order; + done = xc_domain_populate_physmap(xch, ctx->domid, 1, order, 0, &extent); @@ -610,8 +612,10 @@ + PERROR("populate_physmap failed."); + return false; + } -+ if ( done == 0 ) ++ if ( done == 0 ) { ++ errno = ENOMEM; + return false; ++ } + + DPRINTF("4K base_pfn %" PRI_xen_pfn "\n", sp->base_pfn); + return true; ++++++ xsa286-1.patch ++++++ x86/mm: split L4 and L3 parts of the walk out of do_page_walk() The L3 one at least is going to be re-used by a subsequent patch, and splitting the L4 one then as well seems only natural. This is part of XSA-286. 
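As a loose analogy for the helper extraction being described (a toy two-level table, not Xen's real page-table types; the actual helpers additionally map and unmap domain pages, and the later xsa286 patches add locking and type checks on top):

#include <stdint.h>
#include <stdio.h>

#define ENTRIES  512
#define PRESENT  0x1u

typedef struct { uint64_t flags; uint64_t pfn; } l1e_t;
typedef struct { uint64_t flags; l1e_t *l1t; } l2e_t;   /* toy: pointer, not MFN */

static l2e_t root[ENTRIES];

/* Top-level helper: returns an "empty" entry on any failure, never NULL. */
static l2e_t walk_get_l2e(unsigned long addr)
{
    return root[(addr >> 21) % ENTRIES];
}

/* The next level reuses the helper above and again returns an empty entry. */
static l1e_t walk_get_l1e(unsigned long addr)
{
    l2e_t l2e = walk_get_l2e(addr);

    if ( !(l2e.flags & PRESENT) )
        return (l1e_t){ 0 };                 /* l1e_empty() in the real code */

    return l2e.l1t[(addr >> 12) % ENTRIES];
}

int main(void)
{
    static l1e_t l1t[ENTRIES];
    unsigned long addr = (3ul << 21) | (7ul << 12);
    l1e_t l1e;

    l1t[7] = (l1e_t){ .flags = PRESENT, .pfn = 0x1234 };
    root[3] = (l2e_t){ .flags = PRESENT, .l1t = l1t };

    l1e = walk_get_l1e(addr);
    printf("l1e: flags=%#llx pfn=%#llx\n",
           (unsigned long long)l1e.flags, (unsigned long long)l1e.pfn);
    return 0;
}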
Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: George Dunlap <george.dunlap@citrix.com> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -44,26 +44,47 @@ unsigned int __read_mostly m2p_compat_vs l2_pgentry_t *compat_idle_pg_table_l2; -void *do_page_walk(struct vcpu *v, unsigned long addr) +static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr) { - unsigned long mfn = pagetable_get_pfn(v->arch.guest_table); - l4_pgentry_t l4e, *l4t; - l3_pgentry_t l3e, *l3t; - l2_pgentry_t l2e, *l2t; - l1_pgentry_t l1e, *l1t; + unsigned long mfn = pagetable_get_pfn(root); + l4_pgentry_t *l4t, l4e; - if ( !is_pv_vcpu(v) || !is_canonical_address(addr) ) - return NULL; + if ( !is_canonical_address(addr) ) + return l4e_empty(); l4t = map_domain_page(_mfn(mfn)); l4e = l4t[l4_table_offset(addr)]; unmap_domain_page(l4t); + + return l4e; +} + +static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) +{ + l4_pgentry_t l4e = page_walk_get_l4e(root, addr); + l3_pgentry_t *l3t, l3e; + if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) - return NULL; + return l3e_empty(); l3t = map_l3t_from_l4e(l4e); l3e = l3t[l3_table_offset(addr)]; unmap_domain_page(l3t); + + return l3e; +} + +void *do_page_walk(struct vcpu *v, unsigned long addr) +{ + l3_pgentry_t l3e; + l2_pgentry_t l2e, *l2t; + l1_pgentry_t l1e, *l1t; + unsigned long mfn; + + if ( !is_pv_vcpu(v) ) + return NULL; + + l3e = page_walk_get_l3e(v->arch.guest_table, addr); mfn = l3e_get_pfn(l3e); if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) return NULL; ++++++ xsa286-2.patch ++++++ x86/mm: check page types in do_page_walk() For page table entries read to be guaranteed valid, transiently locking the pages and validating their types is necessary. Note that guest use of linear page tables is intentionally not taken into account here, as ordinary data (guest stacks) can't possibly live inside page tables. This is part of XSA-286. Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: George Dunlap <george.dunlap@citrix.com> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -46,15 +46,29 @@ l2_pgentry_t *compat_idle_pg_table_l2; static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr) { - unsigned long mfn = pagetable_get_pfn(root); - l4_pgentry_t *l4t, l4e; + mfn_t mfn = pagetable_get_mfn(root); + /* current's root page table can't disappear under our feet. 
*/ + bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table)); + struct page_info *pg; + l4_pgentry_t l4e = l4e_empty(); if ( !is_canonical_address(addr) ) return l4e_empty(); - l4t = map_domain_page(_mfn(mfn)); - l4e = l4t[l4_table_offset(addr)]; - unmap_domain_page(l4t); + pg = mfn_to_page(mfn); + if ( need_lock && !page_lock(pg) ) + return l4e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table ) + { + l4_pgentry_t *l4t = map_domain_page(mfn); + + l4e = l4t[l4_table_offset(addr)]; + unmap_domain_page(l4t); + } + + if ( need_lock ) + page_unlock(pg); return l4e; } @@ -62,14 +76,26 @@ static l4_pgentry_t page_walk_get_l4e(pa static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) { l4_pgentry_t l4e = page_walk_get_l4e(root, addr); - l3_pgentry_t *l3t, l3e; + mfn_t mfn = l4e_get_mfn(l4e); + struct page_info *pg; + l3_pgentry_t l3e = l3e_empty(); if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) return l3e_empty(); - l3t = map_l3t_from_l4e(l4e); - l3e = l3t[l3_table_offset(addr)]; - unmap_domain_page(l3t); + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return l3e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table ) + { + l3_pgentry_t *l3t = map_domain_page(mfn); + + l3e = l3t[l3_table_offset(addr)]; + unmap_domain_page(l3t); + } + + page_unlock(pg); return l3e; } @@ -77,44 +103,67 @@ static l3_pgentry_t page_walk_get_l3e(pa void *do_page_walk(struct vcpu *v, unsigned long addr) { l3_pgentry_t l3e; - l2_pgentry_t l2e, *l2t; - l1_pgentry_t l1e, *l1t; - unsigned long mfn; + l2_pgentry_t l2e = l2e_empty(); + l1_pgentry_t l1e = l1e_empty(); + mfn_t mfn; + struct page_info *pg; if ( !is_pv_vcpu(v) ) return NULL; l3e = page_walk_get_l3e(v->arch.guest_table, addr); - mfn = l3e_get_pfn(l3e); - if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) + mfn = l3e_get_mfn(l3e); + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return NULL; if ( (l3e_get_flags(l3e) & _PAGE_PSE) ) { - mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)); + mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1))); goto ret; } - l2t = map_domain_page(_mfn(mfn)); - l2e = l2t[l2_table_offset(addr)]; - unmap_domain_page(l2t); - mfn = l2e_get_pfn(l2e); - if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return NULL; + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table ) + { + const l2_pgentry_t *l2t = map_domain_page(mfn); + + l2e = l2t[l2_table_offset(addr)]; + unmap_domain_page(l2t); + } + + page_unlock(pg); + + mfn = l2e_get_mfn(l2e); + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return NULL; if ( (l2e_get_flags(l2e) & _PAGE_PSE) ) { - mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)); + mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1))); goto ret; } - l1t = map_domain_page(_mfn(mfn)); - l1e = l1t[l1_table_offset(addr)]; - unmap_domain_page(l1t); - mfn = l1e_get_pfn(l1e); - if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return NULL; + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table ) + { + const l1_pgentry_t *l1t = map_domain_page(mfn); + + l1e = l1t[l1_table_offset(addr)]; + unmap_domain_page(l1t); + } + + page_unlock(pg); + + mfn = l1e_get_mfn(l1e); + if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return NULL; ret: - return 
map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK); + return map_domain_page(mfn) + (addr & ~PAGE_MASK); } /* ++++++ xsa286-3.patch ++++++ x86/mm: avoid using linear page tables in map_guest_l1e() Replace the linear L2 table access by an actual page walk. This is part of XSA-286. Signed-off-by: Jan Beulich <jbeulich@suse.com> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> Reviewed-by: George Dunlap <george.dunlap@citrix.com> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/arch/x86/pv/mm.c +++ b/xen/arch/x86/pv/mm.c @@ -40,11 +40,14 @@ l1_pgentry_t *map_guest_l1e(unsigned lon if ( unlikely(!__addr_ok(linear)) ) return NULL; - /* Find this l1e and its enclosing l1mfn in the linear map. */ - if ( __copy_from_user(&l2e, - &__linear_l2_table[l2_linear_offset(linear)], - sizeof(l2_pgentry_t)) ) + if ( unlikely(!(current->arch.flags & TF_kernel_mode)) ) + { + ASSERT_UNREACHABLE(); return NULL; + } + + /* Find this l1e and its enclosing l1mfn. */ + l2e = page_walk_get_l2e(current->arch.guest_table, linear); /* Check flags that it will be safe to read the l1e. */ if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT ) --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -100,6 +100,34 @@ static l3_pgentry_t page_walk_get_l3e(pa return l3e; } +l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr) +{ + l3_pgentry_t l3e = page_walk_get_l3e(root, addr); + mfn_t mfn = l3e_get_mfn(l3e); + struct page_info *pg; + l2_pgentry_t l2e = l2e_empty(); + + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || + (l3e_get_flags(l3e) & _PAGE_PSE) ) + return l2e_empty(); + + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return l2e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table ) + { + l2_pgentry_t *l2t = map_domain_page(mfn); + + l2e = l2t[l2_table_offset(addr)]; + unmap_domain_page(l2t); + } + + page_unlock(pg); + + return l2e; +} + void *do_page_walk(struct vcpu *v, unsigned long addr) { l3_pgentry_t l3e; --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -579,7 +579,9 @@ void audit_domains(void); void make_cr3(struct vcpu *v, mfn_t mfn); void update_cr3(struct vcpu *v); int vcpu_destroy_pagetables(struct vcpu *); + void *do_page_walk(struct vcpu *v, unsigned long addr); +l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr); /* Allocator functions for Xen pagetables. */ void *alloc_xen_pagetable(void); ++++++ xsa286-4.patch ++++++ x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e() First of all drop guest_get_eff_l1e() entirely - there's no actual user of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around its only call site. Then replace the linear L1 table access by an actual page walk. This is part of XSA-286. Signed-off-by: Jan Beulich <jbeulich@suse.com> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> Reviewed-by: George Dunlap <george.dunlap@citrix.com> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/arch/x86/pv/mm.c +++ b/xen/arch/x86/pv/mm.c @@ -59,27 +59,6 @@ l1_pgentry_t *map_guest_l1e(unsigned lon } /* - * Read the guest's l1e that maps this address, from the kernel-mode - * page tables. 
- */ -static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear) -{ - struct vcpu *curr = current; - const bool user_mode = !(curr->arch.flags & TF_kernel_mode); - l1_pgentry_t l1e; - - if ( user_mode ) - toggle_guest_pt(curr); - - l1e = guest_get_eff_l1e(linear); - - if ( user_mode ) - toggle_guest_pt(curr); - - return l1e; -} - -/* * Map a guest's LDT page (covering the byte at @offset from start of the LDT) * into Xen's virtual range. Returns true if the mapping changed, false * otherwise. --- a/xen/arch/x86/pv/mm.h +++ b/xen/arch/x86/pv/mm.h @@ -5,19 +5,19 @@ l1_pgentry_t *map_guest_l1e(unsigned lon int new_guest_cr3(mfn_t mfn); -/* Read a PV guest's l1e that maps this linear address. */ -static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear) +/* + * Read the guest's l1e that maps this address, from the kernel-mode + * page tables. + */ +static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear) { - l1_pgentry_t l1e; + l1_pgentry_t l1e = l1e_empty(); ASSERT(!paging_mode_translate(current->domain)); ASSERT(!paging_mode_external(current->domain)); - if ( unlikely(!__addr_ok(linear)) || - __copy_from_user(&l1e, - &__linear_l1_table[l1_linear_offset(linear)], - sizeof(l1_pgentry_t)) ) - l1e = l1e_empty(); + if ( likely(__addr_ok(linear)) ) + l1e = page_walk_get_l1e(current->arch.guest_table, linear); return l1e; } --- a/xen/arch/x86/pv/ro-page-fault.c +++ b/xen/arch/x86/pv/ro-page-fault.c @@ -349,7 +349,7 @@ int pv_ro_page_fault(unsigned long addr, bool mmio_ro; /* Attempt to read the PTE that maps the VA being accessed. */ - pte = guest_get_eff_l1e(addr); + pte = guest_get_eff_kern_l1e(addr); /* We are only looking for read-only mappings */ if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) ) --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -128,6 +128,62 @@ l2_pgentry_t page_walk_get_l2e(pagetable return l2e; } +/* + * For now no "set_accessed" parameter, as all callers want it set to true. + * For now also no "set_dirty" parameter, as all callers deal with r/o + * mappings, and we don't want to set the dirty bit there (conflicts with + * CET-SS). However, as there are CPUs which may set the dirty bit on r/o + * PTEs, the logic below tolerates the bit becoming set "behind our backs". + */ +l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr) +{ + l2_pgentry_t l2e = page_walk_get_l2e(root, addr); + mfn_t mfn = l2e_get_mfn(l2e); + struct page_info *pg; + l1_pgentry_t l1e = l1e_empty(); + + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || + (l2e_get_flags(l2e) & _PAGE_PSE) ) + return l1e_empty(); + + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return l1e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table ) + { + l1_pgentry_t *l1t = map_domain_page(mfn); + + l1e = l1t[l1_table_offset(addr)]; + + if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) == + _PAGE_PRESENT ) + { + l1_pgentry_t ol1e = l1e; + + l1e_add_flags(l1e, _PAGE_ACCESSED); + /* + * Best effort only; with the lock held the page shouldn't + * change anyway, except for the dirty bit to perhaps become set. 
+ */ + while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]), + l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) != + l1e_get_intpte(ol1e) && + !(l1e_get_flags(l1e) & _PAGE_DIRTY) ) + { + l1e_add_flags(ol1e, _PAGE_DIRTY); + l1e_add_flags(l1e, _PAGE_DIRTY); + } + } + + unmap_domain_page(l1t); + } + + page_unlock(pg); + + return l1e; +} + void *do_page_walk(struct vcpu *v, unsigned long addr) { l3_pgentry_t l3e; --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -582,6 +582,7 @@ int vcpu_destroy_pagetables(struct vcpu void *do_page_walk(struct vcpu *v, unsigned long addr); l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr); +l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr); /* Allocator functions for Xen pagetables. */ void *alloc_xen_pagetable(void); ++++++ xsa286-5.patch ++++++ x86/mm: avoid using top level linear page tables in {,un}map_domain_page() Move the page table recursion two levels down. This entails avoiding to free the recursive mapping prematurely in free_perdomain_mappings(). This is part of XSA-286. Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: George Dunlap <george.dunlap@citrix.com> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/arch/x86/domain_page.c +++ b/xen/arch/x86/domain_page.c @@ -65,7 +65,8 @@ void __init mapcache_override_current(st #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER) #define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1) #define MAPCACHE_L1ENT(idx) \ - __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))] + ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \ + ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx] void *map_domain_page(mfn_t mfn) { @@ -235,6 +236,7 @@ int mapcache_domain_init(struct domain * { struct mapcache_domain *dcache = &d->arch.pv.mapcache; unsigned int bitmap_pages; + int rc; ASSERT(is_pv_domain(d)); @@ -243,8 +245,10 @@ int mapcache_domain_init(struct domain * return 0; #endif + BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1)); BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 + - 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) > + 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) + + (1U << L2_PAGETABLE_SHIFT) > MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20)); bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long)); dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE; @@ -253,9 +257,25 @@ int mapcache_domain_init(struct domain * spin_lock_init(&dcache->lock); - return create_perdomain_mapping(d, (unsigned long)dcache->inuse, - 2 * bitmap_pages + 1, - NIL(l1_pgentry_t *), NULL); + rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse, + 2 * bitmap_pages + 1, + NIL(l1_pgentry_t *), NULL); + if ( !rc ) + { + /* + * Install mapping of our L2 table into its own last slot, for easy + * access to the L1 entries via MAPCACHE_L1ENT(). 
+ */ + l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg); + l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)]; + l2_pgentry_t *l2t = map_l2t_from_l3e(l3e); + + l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e); + unmap_domain_page(l2t); + unmap_domain_page(l3t); + } + + return rc; } int mapcache_vcpu_init(struct vcpu *v) @@ -346,7 +366,7 @@ mfn_t domain_page_map_to_mfn(const void else { ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END); - pl1e = &__linear_l1_table[l1_linear_offset(va)]; + pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START)); } return l1e_get_mfn(*pl1e); --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -6061,6 +6061,10 @@ void free_perdomain_mappings(struct doma { struct page_info *l1pg = l2e_get_page(l2tab[j]); + /* mapcache_domain_init() installs a recursive entry. */ + if ( l1pg == l2pg ) + continue; + if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 ) { l1_pgentry_t *l1tab = __map_domain_page(l1pg); ++++++ xsa286-6.patch ++++++ x86/mm: restrict use of linear page tables to shadow mode code Other code does not require them to be set up anymore, so restrict when to populate the respective L4 slot and reduce visibility of the accessors. While with the removal of all uses the vulnerability is actually fixed, removing the creation of the linear mapping adds an extra layer of protection. Similarly reducing visibility of the accessors mostly eliminates the risk of undue re-introduction of uses of the linear mappings. This is (not strictly) part of XSA-286. Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: George Dunlap <george.dunlap@citrix.com> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -1682,9 +1682,10 @@ void init_xen_l4_slots(l4_pgentry_t *l4t l4t[l4_table_offset(PCI_MCFG_VIRT_START)] = idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)]; - /* Slot 258: Self linear mappings. */ + /* Slot 258: Self linear mappings (shadow pt only). */ ASSERT(!mfn_eq(l4mfn, INVALID_MFN)); l4t[l4_table_offset(LINEAR_PT_VIRT_START)] = + !shadow_mode_external(d) ? l4e_empty() : l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW); /* Slot 259: Shadow linear mappings (if applicable) .*/ --- a/xen/arch/x86/mm/shadow/private.h +++ b/xen/arch/x86/mm/shadow/private.h @@ -139,6 +139,15 @@ enum { # define GUEST_PTE_SIZE 4 #endif +/* Where to find each level of the linear mapping */ +#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) +#define __linear_l2_table \ + ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START))) +#define __linear_l3_table \ + ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START))) +#define __linear_l4_table \ + ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START))) + /****************************************************************************** * Auditing routines */ --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -808,9 +808,6 @@ void __init paging_init(void) machine_to_phys_mapping_valid = 1; - /* Set up linear page table mapping. */ - l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)], - l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW)); return; nomem: --- a/xen/include/asm-x86/config.h +++ b/xen/include/asm-x86/config.h @@ -199,7 +199,7 @@ extern unsigned char boot_edid_info[128] */ #define PCI_MCFG_VIRT_START (PML4_ADDR(257)) #define PCI_MCFG_VIRT_END (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES) -/* Slot 258: linear page table (guest table). 
*/ +/* Slot 258: linear page table (monitor table, HVM only). */ #define LINEAR_PT_VIRT_START (PML4_ADDR(258)) #define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES) /* Slot 259: linear page table (shadow table). */ --- a/xen/include/asm-x86/page.h +++ b/xen/include/asm-x86/page.h @@ -294,19 +294,6 @@ void copy_page_sse2(void *, const void * #define vmap_to_mfn(va) _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va)))) #define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va)) -#endif /* !defined(__ASSEMBLY__) */ - -/* Where to find each level of the linear mapping */ -#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) -#define __linear_l2_table \ - ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START))) -#define __linear_l3_table \ - ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START))) -#define __linear_l4_table \ - ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START))) - - -#ifndef __ASSEMBLY__ extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES]; extern l2_pgentry_t *compat_idle_pg_table_l2; extern unsigned int m2p_compat_vstart;