Hello community, here is the log from the commit of package xen for openSUSE:Factory checked in at 2020-11-02 09:38:21 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/xen (Old) and /work/SRC/openSUSE:Factory/.xen.new.3463 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Package is "xen" Mon Nov 2 09:38:21 2020 rev:292 rq:844630 version:4.14.0_10 Changes: -------- --- /work/SRC/openSUSE:Factory/xen/xen.changes 2020-09-25 16:21:30.839358028 +0200 +++ /work/SRC/openSUSE:Factory/.xen.new.3463/xen.changes 2020-11-02 09:38:43.337513714 +0100 @@ -1,0 +2,81 @@ +Wed Oct 21 09:34:32 MDT 2020 - carnold@suse.com + +- Upstream bug fixes (bsc#1027519) + 5f479d9e-x86-begin-to-support-MSR_ARCH_CAPS.patch + 5f4cf06e-x86-Dom0-expose-MSR_ARCH_CAPS.patch + 5f4cf96a-x86-PV-fix-SEGBASE_GS_USER_SEL.patch + 5f560c42-x86-PV-rewrite-segment-ctxt-switch.patch + 5f5b6b7a-hypfs-fix-custom-param-writes.patch + 5f607915-x86-HVM-more-consistent-IO-completion.patch + 5f6cfb5b-x86-PV-dont-GP-for-SYSENTER-with-NT-set.patch + 5f6cfb5b-x86-PV-dont-clobber-NT-on-return-to-guest.patch + 5f71a21e-x86-S3-fix-shadow-stack-resume.patch + 5f76ca65-evtchn-Flask-prealloc-for-send.patch + 5f76caaf-evtchn-FIFO-use-stable-fields.patch + 5f897c25-x86-traps-fix-read_registers-for-DF.patch + 5f897c7b-x86-smpboot-restrict-memguard_guard_stack.patch +- Renamed patches + 5f560c42-x86-PV-64bit-segbase-consistency.patch + Replaces 5f5b6951-x86-PV-64bit-segbase-consistency.patch + 5f6a002d-x86-PV-handle-MSR_MISC_ENABLE-correctly.patch + Replaces 5f6a05a0-pv-Handle-the-Intel-specific-MSR_MISC_ENABLE-correctly.patch + 5f6a0049-memory-dont-skip-RCU-unlock-in-acquire_resource.patch + Replaces 5f6a05b7-xen-memory-Dont-skip-the-RCU-unlock-path-in-acquire_resource.patch + 5f6a0067-x86-vPT-fix-race-when-migrating-timers.patch + Replaces 5f6a05dd-vpt-fix-race-when-migrating-timers-between-vCPUs.patch + 5f6a008e-x86-MSI-drop-read_msi_msg.patch + Replaces 5f6a05fa-msi-get-rid-of-read_msi_msg.patch + 5f6a00aa-x86-MSI-X-restrict-reading-of-PBA-bases.patch + Replaces 5f6a061a-MSI-X-restrict-reading-of-table-PBA-bases-from-BARs.patch + 5f6a00c4-evtchn-relax-port_is_valid.patch + Replaces 5f6a062c-evtchn-relax-port_is_valid.patch + 5f6a00df-x86-PV-avoid-double-exception-injection.patch + Replaces 5f6a065c-pv-Avoid-double-exception-injection.patch + 5f6a00f4-evtchn-add-missing-barriers.patch + Replaces 5f6a0674-xen-evtchn-Add-missing-barriers-when-accessing-allocating-an-event-channel.patch + 5f6a0111-evtchn-x86-enforce-correct-upper-limit.patch + Replaces 5f6a068e-evtchn-x86-enforce-correct-upper-limit-for-32-bit-guests.patch + 5f6a013f-evtchn_reset-shouldnt-succeed-with.patch + Replaces 5f6a06be-evtchn-evtchn_reset-shouldnt-succeed-with-still-open-ports.patch + 5f6a0160-evtchn-IRQ-safe-per-channel-lock.patch + Replaces 5f6a06e0-evtchn-convert-per-channel-lock-to-be-IRQ-safe.patch + 5f6a0178-evtchn-address-races-with-evtchn_reset.patch + Replaces 5f6a06f2-evtchn-address-races-with-evtchn_reset.patch + 5f6a01a4-evtchn-preempt-in-evtchn_destroy.patch + Replaces 5f6a071f-evtchn-arrange-for-preemption-in-evtchn_destroy.patch + 5f6a01c6-evtchn-preempt-in-evtchn_reset.patch + Replaces 5f6a0754-evtchn-arrange-for-preemption-in-evtchn_reset.patch + +------------------------------------------------------------------- +Tue Oct 13 10:48:04 MDT 2020 - carnold@suse.com + +- bsc#1177409 - VUL-0: xen: x86 PV guest INVLPG-like flushes may + leave stale TLB entries (XSA-286) + 
xsa286-1.patch + xsa286-2.patch + xsa286-3.patch + xsa286-4.patch + xsa286-5.patch + xsa286-6.patch +- bsc#1177412 - VUL-0: xen: Race condition in Xen mapping code + (XSA-345) + 5f8ed5d3-x86-mm-map_pages_to_xen-single-exit-path.patch + 5f8ed5eb-x86-mm-modify_xen_mappings-one-exit-path.patch + 5f8ed603-x86-mm-prevent-races-in-mapping-updates.patch +- bsc#1177413 - VUL-0: xen: undue deferral of IOMMU TLB flushes + (XSA-346) + 5f8ed635-IOMMU-suppress-iommu_dont_flush_iotlb-when.patch + 5f8ed64c-IOMMU-hold-page-ref-until-TLB-flush.patch +- bsc#1177414 - VUL-0: xen: unsafe AMD IOMMU page table updates + (XSA-347) + 5f8ed682-AMD-IOMMU-convert-amd_iommu_pte.patch + 5f8ed69c-AMD-IOMMU-update-live-PTEs-atomically.patch + 5f8ed6b0-AMD-IOMMU-suitably-order-DTE-mods.patch + +------------------------------------------------------------------- +Mon Oct 12 10:10:10 UTC 2020 - ohering@suse.de + +- Update libxc.sr.superpage.patch + set errno in x86_hvm_alloc_4k (bsc#1177112) + +------------------------------------------------------------------- Old: ---- 5f5b6951-x86-PV-64bit-segbase-consistency.patch 5f6a05a0-pv-Handle-the-Intel-specific-MSR_MISC_ENABLE-correctly.patch 5f6a05b7-xen-memory-Dont-skip-the-RCU-unlock-path-in-acquire_resource.patch 5f6a05dd-vpt-fix-race-when-migrating-timers-between-vCPUs.patch 5f6a05fa-msi-get-rid-of-read_msi_msg.patch 5f6a061a-MSI-X-restrict-reading-of-table-PBA-bases-from-BARs.patch 5f6a062c-evtchn-relax-port_is_valid.patch 5f6a065c-pv-Avoid-double-exception-injection.patch 5f6a0674-xen-evtchn-Add-missing-barriers-when-accessing-allocating-an-event-channel.patch 5f6a068e-evtchn-x86-enforce-correct-upper-limit-for-32-bit-guests.patch 5f6a06be-evtchn-evtchn_reset-shouldnt-succeed-with-still-open-ports.patch 5f6a06e0-evtchn-convert-per-channel-lock-to-be-IRQ-safe.patch 5f6a06f2-evtchn-address-races-with-evtchn_reset.patch 5f6a071f-evtchn-arrange-for-preemption-in-evtchn_destroy.patch 5f6a0754-evtchn-arrange-for-preemption-in-evtchn_reset.patch New: ---- 5f479d9e-x86-begin-to-support-MSR_ARCH_CAPS.patch 5f4cf06e-x86-Dom0-expose-MSR_ARCH_CAPS.patch 5f4cf96a-x86-PV-fix-SEGBASE_GS_USER_SEL.patch 5f560c42-x86-PV-64bit-segbase-consistency.patch 5f560c42-x86-PV-rewrite-segment-ctxt-switch.patch 5f5b6b7a-hypfs-fix-custom-param-writes.patch 5f607915-x86-HVM-more-consistent-IO-completion.patch 5f6a002d-x86-PV-handle-MSR_MISC_ENABLE-correctly.patch 5f6a0049-memory-dont-skip-RCU-unlock-in-acquire_resource.patch 5f6a0067-x86-vPT-fix-race-when-migrating-timers.patch 5f6a008e-x86-MSI-drop-read_msi_msg.patch 5f6a00aa-x86-MSI-X-restrict-reading-of-PBA-bases.patch 5f6a00c4-evtchn-relax-port_is_valid.patch 5f6a00df-x86-PV-avoid-double-exception-injection.patch 5f6a00f4-evtchn-add-missing-barriers.patch 5f6a0111-evtchn-x86-enforce-correct-upper-limit.patch 5f6a013f-evtchn_reset-shouldnt-succeed-with.patch 5f6a0160-evtchn-IRQ-safe-per-channel-lock.patch 5f6a0178-evtchn-address-races-with-evtchn_reset.patch 5f6a01a4-evtchn-preempt-in-evtchn_destroy.patch 5f6a01c6-evtchn-preempt-in-evtchn_reset.patch 5f6cfb5b-x86-PV-dont-GP-for-SYSENTER-with-NT-set.patch 5f6cfb5b-x86-PV-dont-clobber-NT-on-return-to-guest.patch 5f71a21e-x86-S3-fix-shadow-stack-resume.patch 5f76ca65-evtchn-Flask-prealloc-for-send.patch 5f76caaf-evtchn-FIFO-use-stable-fields.patch 5f897c25-x86-traps-fix-read_registers-for-DF.patch 5f897c7b-x86-smpboot-restrict-memguard_guard_stack.patch 5f8ed5d3-x86-mm-map_pages_to_xen-single-exit-path.patch 5f8ed5eb-x86-mm-modify_xen_mappings-one-exit-path.patch 
5f8ed603-x86-mm-prevent-races-in-mapping-updates.patch 5f8ed635-IOMMU-suppress-iommu_dont_flush_iotlb-when.patch 5f8ed64c-IOMMU-hold-page-ref-until-TLB-flush.patch 5f8ed682-AMD-IOMMU-convert-amd_iommu_pte.patch 5f8ed69c-AMD-IOMMU-update-live-PTEs-atomically.patch 5f8ed6b0-AMD-IOMMU-suitably-order-DTE-mods.patch xsa286-1.patch xsa286-2.patch xsa286-3.patch xsa286-4.patch xsa286-5.patch xsa286-6.patch ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ xen.spec ++++++ --- /var/tmp/diff_new_pack.AogIHC/_old 2020-11-02 09:38:45.029515337 +0100 +++ /var/tmp/diff_new_pack.AogIHC/_new 2020-11-02 09:38:45.033515341 +0100 @@ -125,7 +125,7 @@ BuildRequires: pesign-obs-integration %endif -Version: 4.14.0_08 +Version: 4.14.0_10 Release: 0 Summary: Xen Virtualization: Hypervisor (aka VMM aka Microkernel) License: GPL-2.0-only @@ -165,21 +165,48 @@ # Upstream patches Patch1: 5f1a9916-x86-S3-put-data-sregs-into-known-state.patch Patch2: 5f21b9fd-x86-cpuid-APIC-bit-clearing.patch -Patch3: 5f5b6951-x86-PV-64bit-segbase-consistency.patch -Patch4: 5f6a05a0-pv-Handle-the-Intel-specific-MSR_MISC_ENABLE-correctly.patch -Patch5: 5f6a05b7-xen-memory-Dont-skip-the-RCU-unlock-path-in-acquire_resource.patch -Patch6: 5f6a05dd-vpt-fix-race-when-migrating-timers-between-vCPUs.patch -Patch7: 5f6a05fa-msi-get-rid-of-read_msi_msg.patch -Patch8: 5f6a061a-MSI-X-restrict-reading-of-table-PBA-bases-from-BARs.patch -Patch9: 5f6a062c-evtchn-relax-port_is_valid.patch -Patch10: 5f6a065c-pv-Avoid-double-exception-injection.patch -Patch11: 5f6a0674-xen-evtchn-Add-missing-barriers-when-accessing-allocating-an-event-channel.patch -Patch12: 5f6a068e-evtchn-x86-enforce-correct-upper-limit-for-32-bit-guests.patch -Patch13: 5f6a06be-evtchn-evtchn_reset-shouldnt-succeed-with-still-open-ports.patch -Patch14: 5f6a06e0-evtchn-convert-per-channel-lock-to-be-IRQ-safe.patch -Patch15: 5f6a06f2-evtchn-address-races-with-evtchn_reset.patch -Patch16: 5f6a071f-evtchn-arrange-for-preemption-in-evtchn_destroy.patch -Patch17: 5f6a0754-evtchn-arrange-for-preemption-in-evtchn_reset.patch +Patch3: 5f479d9e-x86-begin-to-support-MSR_ARCH_CAPS.patch +Patch4: 5f4cf06e-x86-Dom0-expose-MSR_ARCH_CAPS.patch +Patch5: 5f4cf96a-x86-PV-fix-SEGBASE_GS_USER_SEL.patch +Patch6: 5f560c42-x86-PV-64bit-segbase-consistency.patch +Patch7: 5f560c42-x86-PV-rewrite-segment-ctxt-switch.patch +Patch8: 5f5b6b7a-hypfs-fix-custom-param-writes.patch +Patch9: 5f607915-x86-HVM-more-consistent-IO-completion.patch +Patch10: 5f6a002d-x86-PV-handle-MSR_MISC_ENABLE-correctly.patch +Patch11: 5f6a0049-memory-dont-skip-RCU-unlock-in-acquire_resource.patch +Patch12: 5f6a0067-x86-vPT-fix-race-when-migrating-timers.patch +Patch13: 5f6a008e-x86-MSI-drop-read_msi_msg.patch +Patch14: 5f6a00aa-x86-MSI-X-restrict-reading-of-PBA-bases.patch +Patch15: 5f6a00c4-evtchn-relax-port_is_valid.patch +Patch16: 5f6a00df-x86-PV-avoid-double-exception-injection.patch +Patch17: 5f6a00f4-evtchn-add-missing-barriers.patch +Patch18: 5f6a0111-evtchn-x86-enforce-correct-upper-limit.patch +Patch19: 5f6a013f-evtchn_reset-shouldnt-succeed-with.patch +Patch20: 5f6a0160-evtchn-IRQ-safe-per-channel-lock.patch +Patch21: 5f6a0178-evtchn-address-races-with-evtchn_reset.patch +Patch22: 5f6a01a4-evtchn-preempt-in-evtchn_destroy.patch +Patch23: 5f6a01c6-evtchn-preempt-in-evtchn_reset.patch +Patch24: 5f6cfb5b-x86-PV-dont-clobber-NT-on-return-to-guest.patch +Patch25: 5f6cfb5b-x86-PV-dont-GP-for-SYSENTER-with-NT-set.patch +Patch26: 
5f71a21e-x86-S3-fix-shadow-stack-resume.patch +Patch27: 5f76ca65-evtchn-Flask-prealloc-for-send.patch +Patch28: 5f76caaf-evtchn-FIFO-use-stable-fields.patch +Patch29: 5f897c25-x86-traps-fix-read_registers-for-DF.patch +Patch30: 5f897c7b-x86-smpboot-restrict-memguard_guard_stack.patch +Patch31: 5f8ed5d3-x86-mm-map_pages_to_xen-single-exit-path.patch +Patch32: 5f8ed5eb-x86-mm-modify_xen_mappings-one-exit-path.patch +Patch33: 5f8ed603-x86-mm-prevent-races-in-mapping-updates.patch +Patch34: 5f8ed635-IOMMU-suppress-iommu_dont_flush_iotlb-when.patch +Patch35: 5f8ed64c-IOMMU-hold-page-ref-until-TLB-flush.patch +Patch36: 5f8ed682-AMD-IOMMU-convert-amd_iommu_pte.patch +Patch37: 5f8ed69c-AMD-IOMMU-update-live-PTEs-atomically.patch +Patch38: 5f8ed6b0-AMD-IOMMU-suitably-order-DTE-mods.patch +Patch28601: xsa286-1.patch +Patch28602: xsa286-2.patch +Patch28603: xsa286-3.patch +Patch28604: xsa286-4.patch +Patch28605: xsa286-5.patch +Patch28606: xsa286-6.patch # Our platform specific patches Patch400: xen-destdir.patch Patch401: vif-bridge-no-iptables.patch @@ -258,8 +285,8 @@ This package contains the libraries used to interact with the Xen virtual machine monitor. -In addition to this package you need to install kernel-xen, xen and -xen-tools to use Xen. +In addition to this package you need to install xen and xen-tools +to use Xen. Authors: @@ -304,8 +331,8 @@ This package contains the control tools that allow you to start, stop, migrate, and manage virtual machines. -In addition to this package you need to install kernel-xen, xen and -xen-libs to use Xen. +In addition to this package you need to install xen and xen-libs +to use Xen. Authors: @@ -424,6 +451,33 @@ %patch15 -p1 %patch16 -p1 %patch17 -p1 +%patch18 -p1 +%patch19 -p1 +%patch20 -p1 +%patch21 -p1 +%patch22 -p1 +%patch23 -p1 +%patch24 -p1 +%patch25 -p1 +%patch26 -p1 +%patch27 -p1 +%patch28 -p1 +%patch29 -p1 +%patch30 -p1 +%patch31 -p1 +%patch32 -p1 +%patch33 -p1 +%patch34 -p1 +%patch35 -p1 +%patch36 -p1 +%patch37 -p1 +%patch38 -p1 +%patch28601 -p1 +%patch28602 -p1 +%patch28603 -p1 +%patch28604 -p1 +%patch28605 -p1 +%patch28606 -p1 # Our platform specific patches %patch400 -p1 %patch401 -p1 ++++++ 5f479d9e-x86-begin-to-support-MSR_ARCH_CAPS.patch ++++++ # Commit e32605b07ef2e01c9d05da9b2d5d7b8f9a5c7c1b # Date 2020-08-27 12:48:46 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86: Begin to introduce support for MSR_ARCH_CAPS ... including serialisation/deserialisation logic and unit tests. There is no current way to configure this MSR correctly for guests. The toolstack side this logic needs building, which is far easier to do with it in place. Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/tools/tests/cpu-policy/test-cpu-policy.c +++ b/tools/tests/cpu-policy/test-cpu-policy.c @@ -374,6 +374,11 @@ static void test_msr_deserialise_failure .msr = { .idx = 0xce, .val = ~0ull }, .rc = -EOVERFLOW, }, + { + .name = "truncated val", + .msr = { .idx = 0x10a, .val = ~0ull }, + .rc = -EOVERFLOW, + }, }; printf("Testing MSR deserialise failure:\n"); --- a/xen/arch/x86/msr.c +++ b/xen/arch/x86/msr.c @@ -220,8 +220,10 @@ int guest_rdmsr(struct vcpu *v, uint32_t break; case MSR_ARCH_CAPABILITIES: - /* Not implemented yet. 
*/ - goto gp_fault; + if ( !cp->feat.arch_caps ) + goto gp_fault; + *val = mp->arch_caps.raw; + break; case MSR_INTEL_MISC_FEATURES_ENABLES: *val = msrs->misc_features_enables.raw; --- a/xen/include/public/arch-x86/cpufeatureset.h +++ b/xen/include/public/arch-x86/cpufeatureset.h @@ -267,7 +267,7 @@ XEN_CPUFEATURE(CET_IBT, 9*32+20) / XEN_CPUFEATURE(IBRSB, 9*32+26) /*A IBRS and IBPB support (used by Intel) */ XEN_CPUFEATURE(STIBP, 9*32+27) /*A STIBP */ XEN_CPUFEATURE(L1D_FLUSH, 9*32+28) /*S MSR_FLUSH_CMD and L1D flush. */ -XEN_CPUFEATURE(ARCH_CAPS, 9*32+29) /* IA32_ARCH_CAPABILITIES MSR */ +XEN_CPUFEATURE(ARCH_CAPS, 9*32+29) /*a IA32_ARCH_CAPABILITIES MSR */ XEN_CPUFEATURE(CORE_CAPS, 9*32+30) /* IA32_CORE_CAPABILITIES MSR */ XEN_CPUFEATURE(SSBD, 9*32+31) /*A MSR_SPEC_CTRL.SSBD available */ --- a/xen/include/xen/lib/x86/msr.h +++ b/xen/include/xen/lib/x86/msr.h @@ -3,7 +3,7 @@ #define XEN_LIB_X86_MSR_H /* Maximum number of MSRs written when serialising msr_policy. */ -#define MSR_MAX_SERIALISED_ENTRIES 1 +#define MSR_MAX_SERIALISED_ENTRIES 2 /* MSR policy object for shared per-domain MSRs */ struct msr_policy @@ -23,6 +23,28 @@ struct msr_policy bool cpuid_faulting:1; }; } platform_info; + + /* + * 0x0000010a - MSR_ARCH_CAPABILITIES + * + * This is an Intel-only MSR, which provides miscellaneous enumeration, + * including those which indicate that microarchitectrual sidechannels are + * fixed in hardware. + */ + union { + uint32_t raw; + struct { + bool rdcl_no:1; + bool ibrs_all:1; + bool rsba:1; + bool skip_l1dfl:1; + bool ssb_no:1; + bool mds_no:1; + bool if_pschange_mc_no:1; + bool tsx_ctrl:1; + bool taa_no:1; + }; + } arch_caps; }; #ifdef __XEN__ --- a/xen/lib/x86/msr.c +++ b/xen/lib/x86/msr.c @@ -39,6 +39,7 @@ int x86_msr_copy_to_buffer(const struct }) COPY_MSR(MSR_INTEL_PLATFORM_INFO, p->platform_info.raw); + COPY_MSR(MSR_ARCH_CAPABILITIES, p->arch_caps.raw); #undef COPY_MSR @@ -99,6 +100,7 @@ int x86_msr_copy_from_buffer(struct msr_ }) case MSR_INTEL_PLATFORM_INFO: ASSIGN(platform_info.raw); break; + case MSR_ARCH_CAPABILITIES: ASSIGN(arch_caps.raw); break; #undef ASSIGN ++++++ 5f4cf06e-x86-Dom0-expose-MSR_ARCH_CAPS.patch ++++++ # Commit e46474278a0e87e2b32ad5dd5fc20e8d2cb0688b # Date 2020-08-31 13:43:26 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/intel: Expose MSR_ARCH_CAPS to dom0 The overhead of (the lack of) MDS_NO alone has been measured at 30% on some workloads. While we're not in a position yet to offer MSR_ARCH_CAPS generally to guests, dom0 doesn't migrate, so we can pass a subset of hardware values straight through. This will cause PVH dom0's not to use KPTI by default, and all dom0's not to use VERW flushing by default, and to use eIBRS in preference to retpoline on recent Intel CPUs. Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/cpuid.c +++ b/xen/arch/x86/cpuid.c @@ -719,6 +719,14 @@ int init_domain_cpuid_policy(struct doma if ( d->disable_migrate ) p->extd.itsc = cpu_has_itsc; + /* + * Expose the "hardware speculation behaviour" bits of ARCH_CAPS to dom0, + * so dom0 can turn off workarounds as appropriate. Temporary, until the + * domain policy logic gains a better understanding of MSRs. 
+ */ + if ( is_hardware_domain(d) && boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) + p->feat.arch_caps = true; + d->arch.cpuid = p; recalculate_cpuid_policy(d); --- a/xen/arch/x86/msr.c +++ b/xen/arch/x86/msr.c @@ -130,6 +130,22 @@ int init_domain_msr_policy(struct domain if ( !opt_dom0_cpuid_faulting && is_control_domain(d) && is_pv_domain(d) ) mp->platform_info.cpuid_faulting = false; + /* + * Expose the "hardware speculation behaviour" bits of ARCH_CAPS to dom0, + * so dom0 can turn off workarounds as appropriate. Temporary, until the + * domain policy logic gains a better understanding of MSRs. + */ + if ( is_hardware_domain(d) && boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) + { + uint64_t val; + + rdmsrl(MSR_ARCH_CAPABILITIES, val); + + mp->arch_caps.raw = val & + (ARCH_CAPS_RDCL_NO | ARCH_CAPS_IBRS_ALL | ARCH_CAPS_RSBA | + ARCH_CAPS_SSB_NO | ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO); + } + d->arch.msr = mp; return 0; ++++++ 5f4cf96a-x86-PV-fix-SEGBASE_GS_USER_SEL.patch ++++++ # Commit afe018e041ec112d90a8b4e6ed607d22aa06f280 # Date 2020-08-31 14:21:46 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/pv: Fix multiple bugs with SEGBASE_GS_USER_SEL The logic takes the segment selector unmodified from guest context. This allowed the guest to load DPL0 descriptors into %gs. Fix up the RPL for non-NUL selectors to be 3. Xen's context switch logic skips saving the inactive %gs base, as it cannot be modified by the guest behind Xen's back. This depends on Xen caching updates to the inactive base, which is was missing from this path. The consequence is that, following SEGBASE_GS_USER_SEL, the next context switch will restore the stale inactive %gs base, and corrupt vcpu state. Rework the hypercall to update the cached idea of gs_base_user, and fix the behaviour in the case of the AMD NUL selector bug to always zero the segment base. Reported-by: Andy Lutomirski <luto@kernel.org> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -1031,17 +1031,54 @@ long do_set_segment_base(unsigned int wh break; case SEGBASE_GS_USER_SEL: - __asm__ __volatile__ ( - " swapgs \n" - "1: movl %k0,%%gs \n" - " "safe_swapgs" \n" - ".section .fixup,\"ax\" \n" - "2: xorl %k0,%k0 \n" - " jmp 1b \n" - ".previous \n" - _ASM_EXTABLE(1b, 2b) - : "+r" (base) ); + { + unsigned int sel = (uint16_t)base; + + /* + * We wish to update the user %gs from the GDT/LDT. Currently, the + * guest kernel's GS_BASE is in context. + */ + asm volatile ( "swapgs" ); + + if ( sel > 3 ) + /* Fix up RPL for non-NUL selectors. */ + sel |= 3; + else if ( boot_cpu_data.x86_vendor & + (X86_VENDOR_AMD | X86_VENDOR_HYGON) ) + /* Work around NUL segment behaviour on AMD hardware. */ + asm volatile ( "mov %[sel], %%gs" + :: [sel] "r" (FLAT_USER_DS32) ); + + /* + * Load the chosen selector, with fault handling. + * + * Errors ought to fail the hypercall, but that was never built in + * originally, and Linux will BUG() if this call fails. + * + * NUL the selector in the case of an error. This too needs to deal + * with the AMD NUL segment behaviour, but it is already a slowpath in + * #GP context so perform the flat load unconditionally to avoid + * complicated logic. + * + * Anyone wanting to check for errors from this hypercall should + * re-read %gs and compare against the input. 
+ */ + asm volatile ( "1: mov %[sel], %%gs\n\t" + ".section .fixup, \"ax\", @progbits\n\t" + "2: mov %k[flat], %%gs\n\t" + " xor %[sel], %[sel]\n\t" + " jmp 1b\n\t" + ".previous\n\t" + _ASM_EXTABLE(1b, 2b) + : [sel] "+r" (sel) + : [flat] "r" (FLAT_USER_DS32) ); + + /* Update the cache of the inactive base, as read from the GDT/LDT. */ + v->arch.pv.gs_base_user = rdgsbase(); + + asm volatile ( safe_swapgs ); break; + } default: ret = -EINVAL; ++++++ 5f560c42-x86-PV-64bit-segbase-consistency.patch ++++++ # Commit a5eaac9245f4f382a3cd0e9710e9d1cba7db20e4 # Date 2020-09-07 11:32:34 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/pv: Fix consistency of 64bit segment bases The comments in save_segments(), _toggle_guest_pt() and write_cr() are false. The %fs and %gs bases can be updated at any time by the guest. As a consequence, Xen's fs_base/etc tracking state is always stale when the vcpu is in context, and must not be used to complete MSR_{FS,GS}_BASE reads, etc. In particular, a sequence such as: wrmsr(MSR_FS_BASE, 0x1ull << 32); write_fs(__USER_DS); base = rdmsr(MSR_FS_BASE); will return the stale base, not the new base. This may cause guest a guest kernel's context switching of userspace to malfunction. Therefore: * Update save_segments(), _toggle_guest_pt() and read_msr() to always read the segment bases from hardware. * Update write_cr(), write_msr() and do_set_segment_base() to not not waste time caching data which is instantly going to become stale again. * Provide comments explaining when the tracking state is and isn't stale. This bug has been present for 14 years, but several bugfixes since have built on and extended the original flawed logic. Fixes: ba9adb737ba ("Apply stricter checking to RDMSR/WRMSR emulations.") Fixes: c42494acb2f ("x86: fix FS/GS base handling when using the fsgsbase feature") Fixed: eccc170053e ("x86/pv: Don't have %cr4.fsgsbase active behind a guest kernels back") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -1562,6 +1562,16 @@ static void load_segments(struct vcpu *n } } +/* + * Record all guest segment state. The guest can load segment selectors + * without trapping, which will also alter the 64bit FS/GS bases. Arbitrary + * changes to bases can also be made with the WR{FS,GS}BASE instructions, when + * enabled. + * + * Guests however cannot use SWAPGS, so there is no mechanism to modify the + * inactive GS base behind Xen's back. Therefore, Xen's copy of the inactive + * GS base is still accurate, and doesn't need reading back from hardware. + */ static void save_segments(struct vcpu *v) { struct cpu_user_regs *regs = &v->arch.user_regs; @@ -1572,14 +1582,15 @@ static void save_segments(struct vcpu *v regs->fs = read_sreg(fs); regs->gs = read_sreg(gs); - /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. 
*/ - if ( (read_cr4() & X86_CR4_FSGSBASE) && !is_pv_32bit_vcpu(v) ) + if ( !is_pv_32bit_vcpu(v) ) { - v->arch.pv.fs_base = __rdfsbase(); + unsigned long gs_base = rdgsbase(); + + v->arch.pv.fs_base = rdfsbase(); if ( v->arch.flags & TF_kernel_mode ) - v->arch.pv.gs_base_kernel = __rdgsbase(); + v->arch.pv.gs_base_kernel = gs_base; else - v->arch.pv.gs_base_user = __rdgsbase(); + v->arch.pv.gs_base_user = gs_base; } if ( regs->ds ) --- a/xen/arch/x86/pv/domain.c +++ b/xen/arch/x86/pv/domain.c @@ -444,17 +444,19 @@ static void _toggle_guest_pt(struct vcpu void toggle_guest_mode(struct vcpu *v) { const struct domain *d = v->domain; + unsigned long gs_base; ASSERT(!is_pv_32bit_vcpu(v)); - /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. */ - if ( read_cr4() & X86_CR4_FSGSBASE ) - { - if ( v->arch.flags & TF_kernel_mode ) - v->arch.pv.gs_base_kernel = __rdgsbase(); - else - v->arch.pv.gs_base_user = __rdgsbase(); - } + /* + * Update the cached value of the GS base about to become inactive, as a + * subsequent context switch won't bother re-reading it. + */ + gs_base = rdgsbase(); + if ( v->arch.flags & TF_kernel_mode ) + v->arch.pv.gs_base_kernel = gs_base; + else + v->arch.pv.gs_base_user = gs_base; asm volatile ( "swapgs" ); _toggle_guest_pt(v); --- a/xen/arch/x86/pv/emul-priv-op.c +++ b/xen/arch/x86/pv/emul-priv-op.c @@ -801,17 +801,6 @@ static int write_cr(unsigned int reg, un } case 4: /* Write CR4 */ - /* - * If this write will disable FSGSBASE, refresh Xen's idea of the - * guest bases now that they can no longer change. - */ - if ( (curr->arch.pv.ctrlreg[4] & X86_CR4_FSGSBASE) && - !(val & X86_CR4_FSGSBASE) ) - { - curr->arch.pv.fs_base = __rdfsbase(); - curr->arch.pv.gs_base_kernel = __rdgsbase(); - } - curr->arch.pv.ctrlreg[4] = pv_fixup_guest_cr4(curr, val); write_cr4(pv_make_cr4(curr)); ctxt_switch_levelling(curr); @@ -860,15 +849,13 @@ static int read_msr(unsigned int reg, ui case MSR_FS_BASE: if ( is_pv_32bit_domain(currd) ) break; - *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdfsbase() - : curr->arch.pv.fs_base; + *val = rdfsbase(); return X86EMUL_OKAY; case MSR_GS_BASE: if ( is_pv_32bit_domain(currd) ) break; - *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdgsbase() - : curr->arch.pv.gs_base_kernel; + *val = rdgsbase(); return X86EMUL_OKAY; case MSR_SHADOW_GS_BASE: @@ -997,14 +984,12 @@ static int write_msr(unsigned int reg, u if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) ) break; wrfsbase(val); - curr->arch.pv.fs_base = val; return X86EMUL_OKAY; case MSR_GS_BASE: if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) ) break; wrgsbase(val); - curr->arch.pv.gs_base_kernel = val; return X86EMUL_OKAY; case MSR_SHADOW_GS_BASE: --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -1002,10 +1002,7 @@ long do_set_segment_base(unsigned int wh { case SEGBASE_FS: if ( is_canonical_address(base) ) - { wrfsbase(base); - v->arch.pv.fs_base = base; - } else ret = -EINVAL; break; @@ -1022,10 +1019,7 @@ long do_set_segment_base(unsigned int wh case SEGBASE_GS_KERNEL: if ( is_canonical_address(base) ) - { wrgsbase(base); - v->arch.pv.gs_base_kernel = base; - } else ret = -EINVAL; break; --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -516,7 +516,24 @@ struct pv_vcpu bool_t syscall32_disables_events; bool_t sysenter_disables_events; - /* Segment base addresses. */ + /* + * 64bit segment bases. 
+ * + * FS and the active GS are always stale when the vCPU is in context, as + * the guest can change them behind Xen's back with MOV SREG, or + * WR{FS,GS}BASE on capable hardware. + * + * The inactive GS base is never stale, as guests can't use SWAPGS to + * access it - all modification is performed by Xen either directly + * (hypercall, #GP emulation), or indirectly (toggle_guest_mode()). + * + * The vCPU context switch path is optimised based on this fact, so any + * path updating or swapping the inactive base must update the cached + * value as well. + * + * Which GS base is active and inactive depends on whether the vCPU is in + * user or kernel context. + */ unsigned long fs_base; unsigned long gs_base_kernel; unsigned long gs_base_user; ++++++ 5f560c42-x86-PV-rewrite-segment-ctxt-switch.patch ++++++ # Commit ad0fd291c5e79191c2e3c70e43dded569f11a450 # Date 2020-09-07 11:32:34 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/pv: Rewrite segment context switching from scratch There are multiple bugs with the existing implementation. On AMD CPUs prior to Zen2, loading a NUL segment selector doesn't clear the segment base, which is a problem for 64bit code which typically expects to use a NUL %fs/%gs selector. On a context switch from any PV vcpu, to a 64bit PV vcpu with an %fs/%gs selector which faults, the fixup logic loads NUL, and the guest is entered at the failsafe callback with the stale base. Alternatively, a PV context switch sequence of 64 (NUL, non-zero base) => 32 (NUL) => 64 (NUL, zero base) will similarly cause Xen to enter the guest with a stale base. Both of these corner cases manifest as state corruption in the final vcpu. However, damage is limited to to 64bit code expecting to use Thread Local Storage with a base pointer of 0, which doesn't occur by default. The context switch logic is extremely complicated, and is attempting to optimise away loading a NUL selector (which is fast), or writing a 64bit base of 0 (which is rare). Furthermore, it fails to respect Linux's ABI with userspace, which manifests as userspace state corruption as far as Linux is concerned. Always restore all selector and base state, in all cases. Leave a large comment explaining hardware behaviour, and the new ABI expectations. Update the comments in the public headers. Drop all "segment preloading" to handle the AMD corner case. It was never anything but a waste of time for %ds/%es, and isn't needed now that %fs/%gs bases are unconditionally written for 64bit PV guests. In load_segments(), store the result of is_pv_32bit_vcpu() as it is an expensive predicate now, and not used in a way which impacts speculative safety. Reported-by: Andy Lutomirski <luto@kernel.org> Reported-by: Sarah Newman <srn@prgmr.com> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> # Commit 1e2d3be2e516e6f415ca6029f651b76a8563a27c # Date 2020-09-08 16:46:31 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/pv: Fix assertions in svm_load_segs() OSSTest has shown an assertion failure: http://logs.test-lab.xenproject.org/osstest/logs/153906/test-xtf-amd64-amd64... This is because we pass a non-NUL selector into svm_load_segs(), which is something we must not do, as this path does not load the attributes/limits from the GDT/LDT. Drop the {fs,gs}_sel parameters from svm_load_segs() and use 0 instead. 
This is acceptable even for non-zero NUL segments, as it is how the IRET instruction behaves in all CPUs. Only use the svm_load_segs() path when both FS and GS are NUL, which is the common case when scheduling a 64bit vcpu with 64bit userspace in context. Fixes: ad0fd291c5 ("x86/pv: Rewrite segment context switching from scratch") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -1017,13 +1017,9 @@ int arch_set_info_guest( if ( !compat ) { v->arch.pv.syscall_callback_eip = c.nat->syscall_callback_eip; - /* non-nul selector kills fs_base */ - v->arch.pv.fs_base = - !(v->arch.user_regs.fs & ~3) ? c.nat->fs_base : 0; + v->arch.pv.fs_base = c.nat->fs_base; v->arch.pv.gs_base_kernel = c.nat->gs_base_kernel; - /* non-nul selector kills gs_base_user */ - v->arch.pv.gs_base_user = - !(v->arch.user_regs.gs & ~3) ? c.nat->gs_base_user : 0; + v->arch.pv.gs_base_user = c.nat->gs_base_user; } else { @@ -1342,58 +1338,60 @@ arch_do_vcpu_op( } /* - * Loading a nul selector does not clear bases and limits on AMD or Hygon - * CPUs. Be on the safe side and re-initialize both to flat segment values - * before loading a nul selector. - */ -#define preload_segment(seg, value) do { \ - if ( !((value) & ~3) && \ - (boot_cpu_data.x86_vendor & \ - (X86_VENDOR_AMD | X86_VENDOR_HYGON)) ) \ - asm volatile ( "movl %k0, %%" #seg \ - :: "r" (FLAT_USER_DS32) ); \ -} while ( false ) - -#define loadsegment(seg,value) ({ \ - int __r = 1; \ - asm volatile ( \ - "1: movl %k1,%%" #seg "\n2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: xorl %k0,%k0\n" \ - " movl %k0,%%" #seg "\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : "=r" (__r) : "r" (value), "0" (__r) );\ - __r; }) - -/* - * save_segments() writes a mask of segments which are dirty (non-zero), - * allowing load_segments() to avoid some expensive segment loads and - * MSR writes. + * Notes on PV segment handling: + * - 32bit: All data from the GDT/LDT. + * - 64bit: In addition, 64bit FS/GS/GS_KERN bases. + * + * Linux's ABI with userspace expects to preserve the full selector and + * segment base, even sel != NUL, base != GDT/LDT for 64bit code. Xen must + * honour this when context switching, to avoid breaking Linux's ABI. + * + * Note: It is impossible to preserve a selector value of 1, 2 or 3, as these + * get reset to 0 by an IRET back to guest context. Code playing with + * arcane corners of x86 get to keep all resulting pieces. + * + * Therefore, we: + * - Load the LDT. + * - Load each segment selector. + * - Any error loads zero, and triggers a failsafe callback. + * - For 64bit, further load the 64bit bases. + * + * An optimisation exists on SVM-capable hardware, where we use a VMLOAD + * instruction to load the LDT and full FS/GS/GS_KERN data in one go. + * + * AMD-like CPUs prior to Zen2 do not zero the segment base or limit when + * loading a NUL selector. This is a problem in principle when context + * switching to a 64bit guest, as a NUL FS/GS segment is usable and will pick + * up the stale base. + * + * However, it is not an issue in practice. NUL segments are unusable for + * 32bit guests (so any stale base won't be used), and we unconditionally + * write the full FS/GS bases for 64bit guests. 
*/ -static DEFINE_PER_CPU(unsigned int, dirty_segment_mask); -#define DIRTY_DS 0x01 -#define DIRTY_ES 0x02 -#define DIRTY_FS 0x04 -#define DIRTY_GS 0x08 -#define DIRTY_FS_BASE 0x10 -#define DIRTY_GS_BASE 0x20 - static void load_segments(struct vcpu *n) { struct cpu_user_regs *uregs = &n->arch.user_regs; - int all_segs_okay = 1; - unsigned int dirty_segment_mask, cpu = smp_processor_id(); - bool fs_gs_done = false; - - /* Load and clear the dirty segment mask. */ - dirty_segment_mask = per_cpu(dirty_segment_mask, cpu); - per_cpu(dirty_segment_mask, cpu) = 0; + bool compat = is_pv_32bit_vcpu(n); + bool all_segs_okay = true, fs_gs_done = false; + + /* + * Attempt to load @seg with selector @val. On error, clear + * @all_segs_okay in function scope, and load NUL into @sel. + */ +#define TRY_LOAD_SEG(seg, val) \ + asm volatile ( "1: mov %k[_val], %%" #seg "\n\t" \ + "2:\n\t" \ + ".section .fixup, \"ax\"\n\t" \ + "3: xor %k[ok], %k[ok]\n\t" \ + " mov %k[ok], %%" #seg "\n\t" \ + " jmp 2b\n\t" \ + ".previous\n\t" \ + _ASM_EXTABLE(1b, 3b) \ + : [ok] "+r" (all_segs_okay) \ + : [_val] "rm" (val) ) #ifdef CONFIG_HVM - if ( cpu_has_svm && !is_pv_32bit_vcpu(n) && - !(read_cr4() & X86_CR4_FSGSBASE) && !((uregs->fs | uregs->gs) & ~3) ) + if ( cpu_has_svm && !compat && (uregs->fs | uregs->gs) <= 3 ) { unsigned long gsb = n->arch.flags & TF_kernel_mode ? n->arch.pv.gs_base_kernel : n->arch.pv.gs_base_user; @@ -1401,62 +1399,25 @@ static void load_segments(struct vcpu *n ? n->arch.pv.gs_base_user : n->arch.pv.gs_base_kernel; fs_gs_done = svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n), - uregs->fs, n->arch.pv.fs_base, - uregs->gs, gsb, gss); + n->arch.pv.fs_base, gsb, gss); } #endif if ( !fs_gs_done ) - load_LDT(n); - - /* Either selector != 0 ==> reload. */ - if ( unlikely((dirty_segment_mask & DIRTY_DS) | uregs->ds) ) - { - preload_segment(ds, uregs->ds); - all_segs_okay &= loadsegment(ds, uregs->ds); - } - - /* Either selector != 0 ==> reload. */ - if ( unlikely((dirty_segment_mask & DIRTY_ES) | uregs->es) ) { - preload_segment(es, uregs->es); - all_segs_okay &= loadsegment(es, uregs->es); - } + load_LDT(n); - /* Either selector != 0 ==> reload. */ - if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) && !fs_gs_done ) - { - all_segs_okay &= loadsegment(fs, uregs->fs); - /* non-nul selector updates fs_base */ - if ( uregs->fs & ~3 ) - dirty_segment_mask &= ~DIRTY_FS_BASE; + TRY_LOAD_SEG(fs, uregs->fs); + TRY_LOAD_SEG(gs, uregs->gs); } - /* Either selector != 0 ==> reload. */ - if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) && !fs_gs_done ) - { - all_segs_okay &= loadsegment(gs, uregs->gs); - /* non-nul selector updates gs_base_user */ - if ( uregs->gs & ~3 ) - dirty_segment_mask &= ~DIRTY_GS_BASE; - } + TRY_LOAD_SEG(ds, uregs->ds); + TRY_LOAD_SEG(es, uregs->es); - if ( !fs_gs_done && !is_pv_32bit_vcpu(n) ) + if ( !fs_gs_done && !compat ) { - /* This can only be non-zero if selector is NULL. */ - if ( n->arch.pv.fs_base | (dirty_segment_mask & DIRTY_FS_BASE) ) - wrfsbase(n->arch.pv.fs_base); - - /* - * Most kernels have non-zero GS base, so don't bother testing. - * (For old AMD hardware this is also a serialising instruction, - * avoiding erratum #88.) - */ + wrfsbase(n->arch.pv.fs_base); wrgsshadow(n->arch.pv.gs_base_kernel); - - /* This can only be non-zero if selector is NULL. */ - if ( n->arch.pv.gs_base_user | - (dirty_segment_mask & DIRTY_GS_BASE) ) - wrgsbase(n->arch.pv.gs_base_user); + wrgsbase(n->arch.pv.gs_base_user); /* If in kernel mode then switch the GS bases around. 
*/ if ( (n->arch.flags & TF_kernel_mode) ) @@ -1575,7 +1536,6 @@ static void load_segments(struct vcpu *n static void save_segments(struct vcpu *v) { struct cpu_user_regs *regs = &v->arch.user_regs; - unsigned int dirty_segment_mask = 0; regs->ds = read_sreg(ds); regs->es = read_sreg(es); @@ -1592,35 +1552,6 @@ static void save_segments(struct vcpu *v else v->arch.pv.gs_base_user = gs_base; } - - if ( regs->ds ) - dirty_segment_mask |= DIRTY_DS; - - if ( regs->es ) - dirty_segment_mask |= DIRTY_ES; - - if ( regs->fs || is_pv_32bit_vcpu(v) ) - { - dirty_segment_mask |= DIRTY_FS; - /* non-nul selector kills fs_base */ - if ( regs->fs & ~3 ) - v->arch.pv.fs_base = 0; - } - if ( v->arch.pv.fs_base ) - dirty_segment_mask |= DIRTY_FS_BASE; - - if ( regs->gs || is_pv_32bit_vcpu(v) ) - { - dirty_segment_mask |= DIRTY_GS; - /* non-nul selector kills gs_base_user */ - if ( regs->gs & ~3 ) - v->arch.pv.gs_base_user = 0; - } - if ( v->arch.flags & TF_kernel_mode ? v->arch.pv.gs_base_kernel - : v->arch.pv.gs_base_user ) - dirty_segment_mask |= DIRTY_GS_BASE; - - this_cpu(dirty_segment_mask) = dirty_segment_mask; } void paravirt_ctxt_switch_from(struct vcpu *v) @@ -1830,8 +1761,8 @@ static void __context_switch(void) #if defined(CONFIG_PV) && defined(CONFIG_HVM) /* Prefetch the VMCB if we expect to use it later in the context switch */ if ( cpu_has_svm && is_pv_domain(nd) && !is_pv_32bit_domain(nd) && - !is_idle_domain(nd) && !(read_cr4() & X86_CR4_FSGSBASE) ) - svm_load_segs(0, 0, 0, 0, 0, 0, 0); + !is_idle_domain(nd) ) + svm_load_segs(0, 0, 0, 0, 0); #endif if ( need_full_gdt(nd) && !per_cpu(full_gdt_loaded, cpu) ) --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -1518,8 +1518,7 @@ static void svm_init_erratum_383(const s #ifdef CONFIG_PV bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base, - unsigned int fs_sel, unsigned long fs_base, - unsigned int gs_sel, unsigned long gs_base, + unsigned long fs_base, unsigned long gs_base, unsigned long gs_shadow) { unsigned int cpu = smp_processor_id(); @@ -1556,14 +1555,12 @@ bool svm_load_segs(unsigned int ldt_ents vmcb->ldtr.base = ldt_base; } - ASSERT(!(fs_sel & ~3)); - vmcb->fs.sel = fs_sel; + vmcb->fs.sel = 0; vmcb->fs.attr = 0; vmcb->fs.limit = 0; vmcb->fs.base = fs_base; - ASSERT(!(gs_sel & ~3)); - vmcb->gs.sel = gs_sel; + vmcb->gs.sel = 0; vmcb->gs.attr = 0; vmcb->gs.limit = 0; vmcb->gs.base = gs_base; --- a/xen/include/asm-x86/hvm/svm/svm.h +++ b/xen/include/asm-x86/hvm/svm/svm.h @@ -52,10 +52,12 @@ void svm_update_guest_cr(struct vcpu *, /* * PV context switch helper. Calls with zero ldt_base request a prefetch of * the VMCB area to be loaded from, instead of an actual load of state. + * + * Must only be used for NUL FS/GS, as the segment attributes/limits are not + * read from the GDT/LDT. */ bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base, - unsigned int fs_sel, unsigned long fs_base, - unsigned int gs_sel, unsigned long gs_base, + unsigned long fs_base, unsigned long gs_base, unsigned long gs_shadow); extern u32 svm_feature_flags; --- a/xen/include/public/arch-x86/xen-x86_64.h +++ b/xen/include/public/arch-x86/xen-x86_64.h @@ -203,8 +203,8 @@ struct cpu_user_regs { uint16_t ss, _pad2[3]; uint16_t es, _pad3[3]; uint16_t ds, _pad4[3]; - uint16_t fs, _pad5[3]; /* Non-nul => takes precedence over fs_base. */ - uint16_t gs, _pad6[3]; /* Non-nul => takes precedence over gs_base_user. 
*/ + uint16_t fs, _pad5[3]; + uint16_t gs, _pad6[3]; }; typedef struct cpu_user_regs cpu_user_regs_t; DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t); ++++++ 5f5b6b7a-hypfs-fix-custom-param-writes.patch ++++++ # Commit b4e41b1750d550bf2b1ccf97ee46f4f682bdbb62 # Date 2020-09-11 14:20:10 +0200 # Author Juergen Gross <jgross@suse.com> # Committer Jan Beulich <jbeulich@suse.com> xen/hypfs: fix writing of custom parameter Today the maximum allowed data length for writing a hypfs node is tested in the generic hypfs_write() function. For custom runtime parameters this might be wrong, as the maximum allowed size is derived from the buffer holding the current setting, while there might be ways to set the parameter needing more characters than the minimal representation of that value. One example for this is the "ept" parameter. Its value buffer is sized to be able to hold the string "exec-sp=0" or "exec-sp=1", while it is allowed to use e.g. "no-exec-sp" or "exec-sp=yes" for setting it. Fix that by moving the length check one level down to the type specific write function. In order to avoid allocation of arbitrary sized buffers use a new MAX_PARAM_SIZE macro as an upper limit for custom writes. The value of MAX_PARAM_SIZE is the same as the limit in parse_params() for a single parameter. Fixes: a659d7cab9af ("xen: add runtime parameter access support to hypfs") Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> Tested-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/common/hypfs.c +++ b/xen/common/hypfs.c @@ -297,7 +297,9 @@ int hypfs_write_leaf(struct hypfs_entry_ int ret; ASSERT(this_cpu(hypfs_locked) == hypfs_write_locked); - ASSERT(ulen <= leaf->e.max_size); + + if ( ulen > leaf->e.max_size ) + return -ENOSPC; if ( leaf->e.type != XEN_HYPFS_TYPE_STRING && leaf->e.type != XEN_HYPFS_TYPE_BLOB && ulen != leaf->e.size ) @@ -356,6 +358,10 @@ int hypfs_write_custom(struct hypfs_entr ASSERT(this_cpu(hypfs_locked) == hypfs_write_locked); + /* Avoid oversized buffer allocation. */ + if ( ulen > MAX_PARAM_SIZE ) + return -ENOSPC; + buf = xzalloc_array(char, ulen); if ( !buf ) return -ENOMEM; @@ -386,9 +392,6 @@ static int hypfs_write(struct hypfs_entr ASSERT(entry->max_size); - if ( ulen > entry->max_size ) - return -ENOSPC; - l = container_of(entry, struct hypfs_entry_leaf, e); return entry->write(l, uaddr, ulen); --- a/xen/common/kernel.c +++ b/xen/common/kernel.c @@ -57,7 +57,7 @@ static int assign_integer_param(const st static int parse_params(const char *cmdline, const struct kernel_param *start, const struct kernel_param *end) { - char opt[128], *optval, *optkey, *q; + char opt[MAX_PARAM_SIZE], *optval, *optkey, *q; const char *p = cmdline, *key; const struct kernel_param *param; int rc, final_rc = 0; --- a/xen/include/xen/param.h +++ b/xen/include/xen/param.h @@ -26,6 +26,9 @@ struct kernel_param { } par; }; +/* Maximum length of a single parameter string. */ +#define MAX_PARAM_SIZE 128 + extern const struct kernel_param __setup_start[], __setup_end[]; #define __param(att) static const att \ ++++++ 5f607915-x86-HVM-more-consistent-IO-completion.patch ++++++ # Commit b807cfe954b8d0d8852398b4c8a586d95d69a342 # Date 2020-09-15 10:19:33 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> x86/HVM: more consistently set I/O completion Doing this just in hvm_emulate_one_insn() is not enough. 
hvm_ud_intercept() and hvm_emulate_one_vm_event() can get invoked for insns requiring one or more continuations, and at least in principle hvm_emulate_one_mmio() could, too. Without proper setting of the field, handle_hvm_io_completion() will do nothing completion-wise, and in particular the missing re-invocation of the insn emulation paths will lead to emulation caching not getting disabled in due course, causing the ASSERT() in {svm,vmx}_vmenter_helper() to trigger. Reported-by: Don Slutz <don.slutz@gmail.com> Similar considerations go for the clearing of vio->mmio_access, which gets moved as well. Additionally all updating of vio->mmio_* now gets done dependent upon the new completion value, rather than hvm_ioreq_needs_completion()'s return value. This is because it is the completion chosen which controls what path will be taken when handling the completion, not the simple boolean return value. In particular, PIO completion doesn't involve going through the insn emulator, and hence emulator state ought to get cleared early (or it won't get cleared at all). The new logic, besides allowing for a caller override for the continuation type to be set (for VMX real mode emulation), will also avoid setting an MMIO completion when a simpler PIO one will do. This is a minor optimization only as a side effect - the behavior is strictly needed at least for hvm_ud_intercept(), as only memory accesses can successfully complete through handle_mmio(). Care of course needs to be taken to correctly deal with "mixed" insns (doing both MMIO and PIO at the same time, i.e. INS/OUTS). For this, hvmemul_validate() now latches whether the insn being emulated is a memory access, as this information is no longer easily available at the point where we want to consume it. Note that the presence of non-NULL .validate fields in the two ops structures in hvm_emulate_one_mmio() was really necessary even before the changes here: Without this, passing non-NULL as middle argument to hvm_emulate_init_once() is meaningless. The restrictions on when the #UD intercept gets actually enabled are why it was decided that this is not a security issue: - the "hvm_fep" option to enable its use is a debugging option only, - for the cross-vendor case is considered experimental, even if unfortunately SUPPORT.md doesn't have an explicit statement about this. The other two affected functions are - hvm_emulate_one_vm_event(), used for introspection, - hvm_emulate_one_mmio(), used for Dom0 only, which aren't qualifying this as needing an XSA either. Signed-off-by: Jan Beulich <jbeulich@suse.com> Tested-by: Don Slutz <don.slutz@gmail.com> Reviewed-by: Paul Durrant <paul@xen.org> Reviewed-by: Kevin Tian <kevin.tian@intel.com> --- a/xen/arch/x86/hvm/emulate.c +++ b/xen/arch/x86/hvm/emulate.c @@ -1683,9 +1683,11 @@ static int hvmemul_validate( const struct x86_emulate_state *state, struct x86_emulate_ctxt *ctxt) { - const struct hvm_emulate_ctxt *hvmemul_ctxt = + struct hvm_emulate_ctxt *hvmemul_ctxt = container_of(ctxt, struct hvm_emulate_ctxt, ctxt); + hvmemul_ctxt->is_mem_access = x86_insn_is_mem_access(state, ctxt); + return !hvmemul_ctxt->validate || hvmemul_ctxt->validate(state, ctxt) ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE; } @@ -2610,8 +2612,14 @@ static const struct x86_emulate_ops hvm_ .vmfunc = hvmemul_vmfunc, }; +/* + * Note that passing HVMIO_no_completion into this function serves as kind + * of (but not fully) an "auto select completion" indicator. 
When there's + * no completion needed, the passed in value will be ignored in any case. + */ static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt, - const struct x86_emulate_ops *ops) + const struct x86_emulate_ops *ops, + enum hvm_io_completion completion) { const struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs; struct vcpu *curr = current; @@ -2642,16 +2650,31 @@ static int _hvm_emulate_one(struct hvm_e rc = X86EMUL_RETRY; if ( !hvm_ioreq_needs_completion(&vio->io_req) ) + completion = HVMIO_no_completion; + else if ( completion == HVMIO_no_completion ) + completion = (vio->io_req.type != IOREQ_TYPE_PIO || + hvmemul_ctxt->is_mem_access) ? HVMIO_mmio_completion + : HVMIO_pio_completion; + + switch ( vio->io_completion = completion ) { + case HVMIO_no_completion: + case HVMIO_pio_completion: vio->mmio_cache_count = 0; vio->mmio_insn_bytes = 0; + vio->mmio_access = (struct npfec){}; hvmemul_cache_disable(curr); - } - else - { + break; + + case HVMIO_mmio_completion: + case HVMIO_realmode_completion: BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf)); vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes; memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes); + break; + + default: + ASSERT_UNREACHABLE(); } if ( hvmemul_ctxt->ctxt.retire.singlestep ) @@ -2692,9 +2715,10 @@ static int _hvm_emulate_one(struct hvm_e } int hvm_emulate_one( - struct hvm_emulate_ctxt *hvmemul_ctxt) + struct hvm_emulate_ctxt *hvmemul_ctxt, + enum hvm_io_completion completion) { - return _hvm_emulate_one(hvmemul_ctxt, &hvm_emulate_ops); + return _hvm_emulate_one(hvmemul_ctxt, &hvm_emulate_ops, completion); } int hvm_emulate_one_mmio(unsigned long mfn, unsigned long gla) @@ -2703,11 +2727,13 @@ int hvm_emulate_one_mmio(unsigned long m .read = x86emul_unhandleable_rw, .insn_fetch = hvmemul_insn_fetch, .write = mmcfg_intercept_write, + .validate = hvmemul_validate, }; static const struct x86_emulate_ops hvm_ro_emulate_ops_mmio = { .read = x86emul_unhandleable_rw, .insn_fetch = hvmemul_insn_fetch, .write = mmio_ro_emulated_write, + .validate = hvmemul_validate, }; struct mmio_ro_emulate_ctxt mmio_ro_ctxt = { .cr2 = gla }; struct hvm_emulate_ctxt ctxt; @@ -2727,8 +2753,8 @@ int hvm_emulate_one_mmio(unsigned long m hvm_emulate_init_once(&ctxt, x86_insn_is_mem_write, guest_cpu_user_regs()); ctxt.ctxt.data = &mmio_ro_ctxt; - rc = _hvm_emulate_one(&ctxt, ops); - switch ( rc ) + + switch ( rc = _hvm_emulate_one(&ctxt, ops, HVMIO_no_completion) ) { case X86EMUL_UNHANDLEABLE: case X86EMUL_UNIMPLEMENTED: @@ -2755,7 +2781,8 @@ void hvm_emulate_one_vm_event(enum emul_ switch ( kind ) { case EMUL_KIND_NOWRITE: - rc = _hvm_emulate_one(&ctx, &hvm_emulate_ops_no_write); + rc = _hvm_emulate_one(&ctx, &hvm_emulate_ops_no_write, + HVMIO_no_completion); break; case EMUL_KIND_SET_CONTEXT_INSN: { struct vcpu *curr = current; @@ -2776,7 +2803,7 @@ void hvm_emulate_one_vm_event(enum emul_ /* Fall-through */ default: ctx.set_context = (kind == EMUL_KIND_SET_CONTEXT_DATA); - rc = hvm_emulate_one(&ctx); + rc = hvm_emulate_one(&ctx, HVMIO_no_completion); } switch ( rc ) @@ -2874,6 +2901,8 @@ void hvm_emulate_init_per_insn( pfec, NULL) == HVMTRANS_okay) ? 
sizeof(hvmemul_ctxt->insn_buf) : 0; } + + hvmemul_ctxt->is_mem_access = false; } void hvm_emulate_writeback( --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -3810,7 +3810,7 @@ void hvm_ud_intercept(struct cpu_user_re return; } - switch ( hvm_emulate_one(&ctxt) ) + switch ( hvm_emulate_one(&ctxt, HVMIO_no_completion) ) { case X86EMUL_UNHANDLEABLE: case X86EMUL_UNIMPLEMENTED: --- a/xen/arch/x86/hvm/io.c +++ b/xen/arch/x86/hvm/io.c @@ -81,20 +81,11 @@ void send_invalidate_req(void) bool hvm_emulate_one_insn(hvm_emulate_validate_t *validate, const char *descr) { struct hvm_emulate_ctxt ctxt; - struct vcpu *curr = current; - struct hvm_vcpu_io *vio = &curr->arch.hvm.hvm_io; int rc; hvm_emulate_init_once(&ctxt, validate, guest_cpu_user_regs()); - rc = hvm_emulate_one(&ctxt); - - if ( hvm_ioreq_needs_completion(&vio->io_req) ) - vio->io_completion = HVMIO_mmio_completion; - else - vio->mmio_access = (struct npfec){}; - - switch ( rc ) + switch ( rc = hvm_emulate_one(&ctxt, HVMIO_no_completion) ) { case X86EMUL_UNHANDLEABLE: hvm_dump_emulation_state(XENLOG_G_WARNING, descr, &ctxt, rc); --- a/xen/arch/x86/hvm/vmx/realmode.c +++ b/xen/arch/x86/hvm/vmx/realmode.c @@ -97,15 +97,11 @@ static void realmode_deliver_exception( void vmx_realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt) { struct vcpu *curr = current; - struct hvm_vcpu_io *vio = &curr->arch.hvm.hvm_io; int rc; perfc_incr(realmode_emulations); - rc = hvm_emulate_one(hvmemul_ctxt); - - if ( hvm_ioreq_needs_completion(&vio->io_req) ) - vio->io_completion = HVMIO_realmode_completion; + rc = hvm_emulate_one(hvmemul_ctxt, HVMIO_realmode_completion); if ( rc == X86EMUL_UNHANDLEABLE ) { --- a/xen/include/asm-x86/hvm/emulate.h +++ b/xen/include/asm-x86/hvm/emulate.h @@ -48,6 +48,8 @@ struct hvm_emulate_ctxt { uint32_t intr_shadow; + bool is_mem_access; + bool_t set_context; }; @@ -62,7 +64,8 @@ bool __nonnull(1, 2) hvm_emulate_one_ins hvm_emulate_validate_t *validate, const char *descr); int hvm_emulate_one( - struct hvm_emulate_ctxt *hvmemul_ctxt); + struct hvm_emulate_ctxt *hvmemul_ctxt, + enum hvm_io_completion completion); void hvm_emulate_one_vm_event(enum emul_kind kind, unsigned int trapnr, unsigned int errcode); ++++++ 5f6a002d-x86-PV-handle-MSR_MISC_ENABLE-correctly.patch ++++++ # Commit e71301ecd50f2d3bd1b960bbf7dcf850d02e7e8a # Date 2020-09-22 15:46:21 +0200 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Jan Beulich <jbeulich@suse.com> x86/pv: Handle the Intel-specific MSR_MISC_ENABLE correctly This MSR doesn't exist on AMD hardware, and switching away from the safe functions in the common MSR path was an erroneous change. Partially revert the change. This is XSA-333. 
Fixes: 4fdc932b3cc ("x86/Intel: drop another 32-bit leftover") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Wei Liu <wl@xen.org> --- a/xen/arch/x86/pv/emul-priv-op.c +++ b/xen/arch/x86/pv/emul-priv-op.c @@ -913,7 +913,8 @@ static int read_msr(unsigned int reg, ui return X86EMUL_OKAY; case MSR_IA32_MISC_ENABLE: - rdmsrl(reg, *val); + if ( rdmsr_safe(reg, *val) ) + break; *val = guest_misc_enable(*val); return X86EMUL_OKAY; @@ -1053,7 +1054,8 @@ static int write_msr(unsigned int reg, u break; case MSR_IA32_MISC_ENABLE: - rdmsrl(reg, temp); + if ( rdmsr_safe(reg, temp) ) + break; if ( val != guest_misc_enable(temp) ) goto invalid; return X86EMUL_OKAY; ++++++ 5f6a0049-memory-dont-skip-RCU-unlock-in-acquire_resource.patch ++++++ # Commit 42317dede5be4fe5be27c873f7a3f94d3bba98e0 # Date 2020-09-22 15:46:49 +0200 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Jan Beulich <jbeulich@suse.com> xen/memory: Don't skip the RCU unlock path in acquire_resource() In the case that an HVM Stubdomain makes an XENMEM_acquire_resource hypercall, the FIXME path will bypass rcu_unlock_domain() on the way out of the function. Move the check to the start of the function. This does change the behaviour of the get-size path for HVM Stubdomains, but that functionality is currently broken and unused anyway, as well as being quite useless to entities which can't actually map the resource anyway. This is XSA-334. Fixes: 83fa6552ce ("common: add a new mappable resource type: XENMEM_resource_grant_table") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -1058,6 +1058,14 @@ static int acquire_resource( xen_pfn_t mfn_list[32]; int rc; + /* + * FIXME: Until foreign pages inserted into the P2M are properly + * reference counted, it is unsafe to allow mapping of + * resource pages unless the caller is the hardware domain. + */ + if ( paging_mode_translate(currd) && !is_hardware_domain(currd) ) + return -EACCES; + if ( copy_from_guest(&xmar, arg, 1) ) return -EFAULT; @@ -1114,14 +1122,6 @@ static int acquire_resource( xen_pfn_t gfn_list[ARRAY_SIZE(mfn_list)]; unsigned int i; - /* - * FIXME: Until foreign pages inserted into the P2M are properly - * reference counted, it is unsafe to allow mapping of - * resource pages unless the caller is the hardware domain. - */ - if ( !is_hardware_domain(currd) ) - return -EACCES; - if ( copy_from_guest(gfn_list, xmar.frame_list, xmar.nr_frames) ) rc = -EFAULT; ++++++ 5f6a0067-x86-vPT-fix-race-when-migrating-timers.patch ++++++ # Commit 8e76aef72820435e766c7f339ed36da33da90c40 # Date 2020-09-22 15:47:19 +0200 # Author Roger Pau Monné <roger.pau@citrix.com> # Committer Jan Beulich <jbeulich@suse.com> x86/vpt: fix race when migrating timers between vCPUs The current vPT code will migrate the emulated timers between vCPUs (change the pt->vcpu field) while just holding the destination lock, either from create_periodic_time or pt_adjust_global_vcpu_target if the global target is adjusted. Changing the periodic_timer vCPU field in this way creates a race where a third party could grab the lock in the unlocked region of pt_adjust_global_vcpu_target (or before create_periodic_time performs the vcpu change) and then release the lock from a different vCPU, creating a locking imbalance. Introduce a per-domain rwlock in order to protect periodic_time migration between vCPU lists. 
Taking the lock in read mode prevents any timer from being migrated to a different vCPU, while taking it in write mode allows performing migration of timers across vCPUs. The per-vcpu locks are still used to protect all the other fields from the periodic_timer struct. Note that such migration shouldn't happen frequently, and hence there's no performance drop as a result of such locking. This is XSA-336. Reported-by: Igor Druzhinin <igor.druzhinin@citrix.com> Tested-by: Igor Druzhinin <igor.druzhinin@citrix.com> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -658,6 +658,8 @@ int hvm_domain_initialise(struct domain /* need link to containing domain */ d->arch.hvm.pl_time->domain = d; + rwlock_init(&d->arch.hvm.pl_time->pt_migrate); + /* Set the default IO Bitmap. */ if ( is_hardware_domain(d) ) { --- a/xen/arch/x86/hvm/vpt.c +++ b/xen/arch/x86/hvm/vpt.c @@ -153,23 +153,32 @@ static int pt_irq_masked(struct periodic return 1; } -static void pt_lock(struct periodic_time *pt) +static void pt_vcpu_lock(struct vcpu *v) { - struct vcpu *v; + read_lock(&v->domain->arch.hvm.pl_time->pt_migrate); + spin_lock(&v->arch.hvm.tm_lock); +} - for ( ; ; ) - { - v = pt->vcpu; - spin_lock(&v->arch.hvm.tm_lock); - if ( likely(pt->vcpu == v) ) - break; - spin_unlock(&v->arch.hvm.tm_lock); - } +static void pt_vcpu_unlock(struct vcpu *v) +{ + spin_unlock(&v->arch.hvm.tm_lock); + read_unlock(&v->domain->arch.hvm.pl_time->pt_migrate); +} + +static void pt_lock(struct periodic_time *pt) +{ + /* + * We cannot use pt_vcpu_lock here, because we need to acquire the + * per-domain lock first and then (re-)fetch the value of pt->vcpu, or + * else we might be using a stale value of pt->vcpu. 
+ */ + read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); + spin_lock(&pt->vcpu->arch.hvm.tm_lock); } static void pt_unlock(struct periodic_time *pt) { - spin_unlock(&pt->vcpu->arch.hvm.tm_lock); + pt_vcpu_unlock(pt->vcpu); } static void pt_process_missed_ticks(struct periodic_time *pt) @@ -219,7 +228,7 @@ void pt_save_timer(struct vcpu *v) if ( v->pause_flags & VPF_blocked ) return; - spin_lock(&v->arch.hvm.tm_lock); + pt_vcpu_lock(v); list_for_each_entry ( pt, head, list ) if ( !pt->do_not_freeze ) @@ -227,7 +236,7 @@ void pt_save_timer(struct vcpu *v) pt_freeze_time(v); - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); } void pt_restore_timer(struct vcpu *v) @@ -235,7 +244,7 @@ void pt_restore_timer(struct vcpu *v) struct list_head *head = &v->arch.hvm.tm_list; struct periodic_time *pt; - spin_lock(&v->arch.hvm.tm_lock); + pt_vcpu_lock(v); list_for_each_entry ( pt, head, list ) { @@ -248,7 +257,7 @@ void pt_restore_timer(struct vcpu *v) pt_thaw_time(v); - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); } static void pt_timer_fn(void *data) @@ -309,7 +318,7 @@ int pt_update_irq(struct vcpu *v) int irq, pt_vector = -1; bool level; - spin_lock(&v->arch.hvm.tm_lock); + pt_vcpu_lock(v); earliest_pt = NULL; max_lag = -1ULL; @@ -339,7 +348,7 @@ int pt_update_irq(struct vcpu *v) if ( earliest_pt == NULL ) { - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); return -1; } @@ -347,7 +356,7 @@ int pt_update_irq(struct vcpu *v) irq = earliest_pt->irq; level = earliest_pt->level; - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); switch ( earliest_pt->source ) { @@ -394,7 +403,7 @@ int pt_update_irq(struct vcpu *v) time_cb *cb = NULL; void *cb_priv; - spin_lock(&v->arch.hvm.tm_lock); + pt_vcpu_lock(v); /* Make sure the timer is still on the list. 
*/ list_for_each_entry ( pt, &v->arch.hvm.tm_list, list ) if ( pt == earliest_pt ) @@ -404,7 +413,7 @@ int pt_update_irq(struct vcpu *v) cb_priv = pt->priv; break; } - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); if ( cb != NULL ) cb(v, cb_priv); @@ -441,12 +450,12 @@ void pt_intr_post(struct vcpu *v, struct if ( intack.source == hvm_intsrc_vector ) return; - spin_lock(&v->arch.hvm.tm_lock); + pt_vcpu_lock(v); pt = is_pt_irq(v, intack); if ( pt == NULL ) { - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); return; } @@ -455,7 +464,7 @@ void pt_intr_post(struct vcpu *v, struct cb = pt->cb; cb_priv = pt->priv; - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); if ( cb != NULL ) cb(v, cb_priv); @@ -466,12 +475,12 @@ void pt_migrate(struct vcpu *v) struct list_head *head = &v->arch.hvm.tm_list; struct periodic_time *pt; - spin_lock(&v->arch.hvm.tm_lock); + pt_vcpu_lock(v); list_for_each_entry ( pt, head, list ) migrate_timer(&pt->timer, v->processor); - spin_unlock(&v->arch.hvm.tm_lock); + pt_vcpu_unlock(v); } void create_periodic_time( @@ -490,7 +499,7 @@ void create_periodic_time( destroy_periodic_time(pt); - spin_lock(&v->arch.hvm.tm_lock); + write_lock(&v->domain->arch.hvm.pl_time->pt_migrate); pt->pending_intr_nr = 0; pt->do_not_freeze = 0; @@ -540,7 +549,7 @@ void create_periodic_time( init_timer(&pt->timer, pt_timer_fn, pt, v->processor); set_timer(&pt->timer, pt->scheduled); - spin_unlock(&v->arch.hvm.tm_lock); + write_unlock(&v->domain->arch.hvm.pl_time->pt_migrate); } void destroy_periodic_time(struct periodic_time *pt) @@ -565,30 +574,20 @@ void destroy_periodic_time(struct period static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v) { - int on_list; - ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic); if ( pt->vcpu == NULL ) return; - pt_lock(pt); - on_list = pt->on_list; - if ( pt->on_list ) - list_del(&pt->list); - pt->on_list = 0; - pt_unlock(pt); - - spin_lock(&v->arch.hvm.tm_lock); + write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); pt->vcpu = v; - if ( on_list ) + if ( pt->on_list ) { - pt->on_list = 1; + list_del(&pt->list); list_add(&pt->list, &v->arch.hvm.tm_list); - migrate_timer(&pt->timer, v->processor); } - spin_unlock(&v->arch.hvm.tm_lock); + write_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); } void pt_adjust_global_vcpu_target(struct vcpu *v) --- a/xen/include/asm-x86/hvm/vpt.h +++ b/xen/include/asm-x86/hvm/vpt.h @@ -128,6 +128,13 @@ struct pl_time { /* platform time */ struct RTCState vrtc; struct HPETState vhpet; struct PMTState vpmt; + /* + * rwlock to prevent periodic_time vCPU migration. Take the lock in read + * mode in order to prevent the vcpu field of periodic_time from changing. + * Lock must be taken in write mode when changes to the vcpu field are + * performed, as it allows exclusive access to all the timers of a domain. + */ + rwlock_t pt_migrate; /* guest_time = Xen sys time + stime_offset */ int64_t stime_offset; /* Ensures monotonicity in appropriate timer modes. */ ++++++ 5f6a008e-x86-MSI-drop-read_msi_msg.patch ++++++ # Commit cb5e9730862ba0c99e81d3fbd8f65010707245e4 # Date 2020-09-22 15:47:58 +0200 # Author Roger Pau Monné <roger.pau@citrix.com> # Committer Jan Beulich <jbeulich@suse.com> x86/msi: get rid of read_msi_msg It's safer and faster to just use the cached last written (untranslated) MSI message stored in msi_desc for the single user that calls read_msi_msg. 
This also prevents relying on the data read from the device MSI registers in order to figure out the index into the IOMMU interrupt remapping table, which is not safe. This is part of XSA-337. Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> Requested-by: Andrew Cooper <andrew.cooper3@citrix.com> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/msi.c +++ b/xen/arch/x86/msi.c @@ -185,54 +185,6 @@ void msi_compose_msg(unsigned vector, co MSI_DATA_VECTOR(vector); } -static bool read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) -{ - switch ( entry->msi_attrib.type ) - { - case PCI_CAP_ID_MSI: - { - struct pci_dev *dev = entry->dev; - int pos = entry->msi_attrib.pos; - uint16_t data; - - msg->address_lo = pci_conf_read32(dev->sbdf, - msi_lower_address_reg(pos)); - if ( entry->msi_attrib.is_64 ) - { - msg->address_hi = pci_conf_read32(dev->sbdf, - msi_upper_address_reg(pos)); - data = pci_conf_read16(dev->sbdf, msi_data_reg(pos, 1)); - } - else - { - msg->address_hi = 0; - data = pci_conf_read16(dev->sbdf, msi_data_reg(pos, 0)); - } - msg->data = data; - break; - } - case PCI_CAP_ID_MSIX: - { - void __iomem *base = entry->mask_base; - - if ( unlikely(!msix_memory_decoded(entry->dev, - entry->msi_attrib.pos)) ) - return false; - msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET); - break; - } - default: - BUG(); - } - - if ( iommu_intremap ) - iommu_read_msi_from_ire(entry, msg); - - return true; -} - static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { entry->msg = *msg; @@ -304,10 +256,7 @@ void set_msi_affinity(struct irq_desc *d ASSERT(spin_is_locked(&desc->lock)); - memset(&msg, 0, sizeof(msg)); - if ( !read_msi_msg(msi_desc, &msg) ) - return; - + msg = msi_desc->msg; msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(desc->arch.vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; ++++++ 5f6a00aa-x86-MSI-X-restrict-reading-of-PBA-bases.patch ++++++ # Commit beb54596cfdaf15f6a86d7b1bf84ca8a0b9c6b9b # Date 2020-09-22 15:48:26 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> x86/MSI-X: restrict reading of table/PBA bases from BARs When assigned to less trusted or un-trusted guests, devices may change state behind our backs (they may e.g. get reset by means we may not know about). Therefore we should avoid reading BARs from hardware once a device is no longer owned by Dom0. Furthermore when we can't read a BAR, or when we read zero, we shouldn't instead use the caller provided address unless that caller can be trusted. Re-arrange the logic in msix_capability_init() such that only Dom0 (and only if the device isn't DomU-owned yet) or calls through PHYSDEVOP_prepare_msix will actually result in the reading of the respective BAR register(s). Additionally do so only as long as in-use table entries are known (note that invocation of PHYSDEVOP_prepare_msix counts as a "pseudo" entry). In all other uses the value already recorded will get used instead. Clear the recorded values in _pci_cleanup_msix() as well as on the one affected error path. (Adjust this error path to also avoid blindly disabling MSI-X when it was enabled on entry to the function.) While moving around variable declarations (in many cases to reduce their scopes), also adjust some of their types. This is part of XSA-337. 
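The caching idea can be sketched in isolation as follows; struct msix_state, read_bar_from_hw() and get_table_base() are simplified illustrations under the assumption of a single recorded value, not the actual Xen data structures:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative state: a recorded table base; 0 means "nothing recorded". */
    struct msix_state {
        uint64_t table_base;
    };

    static uint64_t read_bar_from_hw(void)
    {
        return 0xfe000000;         /* mock BAR contents */
    }

    /*
     * Read the BAR only while the device is still owned by a trusted domain;
     * later callers get the recorded value or an error, never fresh (and
     * possibly attacker-influenced) hardware state.
     */
    static int get_table_base(struct msix_state *s, bool trusted, uint64_t *base)
    {
        if ( !s->table_base && trusted )
            s->table_base = read_bar_from_hw();
        if ( !s->table_base )
            return -1;
        *base = s->table_base;
        return 0;
    }

    int main(void)
    {
        struct msix_state s = { 0 };
        uint64_t base = 0;
        int rc;

        printf("untrusted, nothing recorded: %d\n", get_table_base(&s, false, &base));
        rc = get_table_base(&s, true, &base);
        printf("trusted: %d (base %#llx)\n", rc, (unsigned long long)base);
        printf("untrusted, recorded value reused: %d\n", get_table_base(&s, false, &base));
        return 0;
    }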
Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> --- a/xen/arch/x86/msi.c +++ b/xen/arch/x86/msi.c @@ -771,16 +771,14 @@ static int msix_capability_init(struct p { struct arch_msix *msix = dev->msix; struct msi_desc *entry = NULL; - int vf; u16 control; u64 table_paddr; u32 table_offset; - u8 bir, pbus, pslot, pfunc; u16 seg = dev->seg; u8 bus = dev->bus; u8 slot = PCI_SLOT(dev->devfn); u8 func = PCI_FUNC(dev->devfn); - bool maskall = msix->host_maskall; + bool maskall = msix->host_maskall, zap_on_error = false; unsigned int pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX); @@ -822,43 +820,45 @@ static int msix_capability_init(struct p /* Locate MSI-X table region */ table_offset = pci_conf_read32(dev->sbdf, msix_table_offset_reg(pos)); - bir = (u8)(table_offset & PCI_MSIX_BIRMASK); - table_offset &= ~PCI_MSIX_BIRMASK; + if ( !msix->used_entries && + (!msi || + (is_hardware_domain(current->domain) && + (dev->domain == current->domain || dev->domain == dom_io))) ) + { + unsigned int bir = table_offset & PCI_MSIX_BIRMASK, pbus, pslot, pfunc; + int vf; + paddr_t pba_paddr; + unsigned int pba_offset; - if ( !dev->info.is_virtfn ) - { - pbus = bus; - pslot = slot; - pfunc = func; - vf = -1; - } - else - { - pbus = dev->info.physfn.bus; - pslot = PCI_SLOT(dev->info.physfn.devfn); - pfunc = PCI_FUNC(dev->info.physfn.devfn); - vf = PCI_BDF2(dev->bus, dev->devfn); - } - - table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf); - WARN_ON(msi && msi->table_base != table_paddr); - if ( !table_paddr ) - { - if ( !msi || !msi->table_base ) + if ( !dev->info.is_virtfn ) { - pci_conf_write16(dev->sbdf, msix_control_reg(pos), - control & ~PCI_MSIX_FLAGS_ENABLE); - xfree(entry); - return -ENXIO; + pbus = bus; + pslot = slot; + pfunc = func; + vf = -1; + } + else + { + pbus = dev->info.physfn.bus; + pslot = PCI_SLOT(dev->info.physfn.devfn); + pfunc = PCI_FUNC(dev->info.physfn.devfn); + vf = PCI_BDF2(dev->bus, dev->devfn); } - table_paddr = msi->table_base; - } - table_paddr += table_offset; - if ( !msix->used_entries ) - { - u64 pba_paddr; - u32 pba_offset; + table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf); + WARN_ON(msi && msi->table_base != table_paddr); + if ( !table_paddr ) + { + if ( !msi || !msi->table_base ) + { + pci_conf_write16(dev->sbdf, msix_control_reg(pos), + control & ~PCI_MSIX_FLAGS_ENABLE); + xfree(entry); + return -ENXIO; + } + table_paddr = msi->table_base; + } + table_paddr += table_offset & ~PCI_MSIX_BIRMASK; msix->table.first = PFN_DOWN(table_paddr); msix->table.last = PFN_DOWN(table_paddr + @@ -877,7 +877,18 @@ static int msix_capability_init(struct p BITS_TO_LONGS(msix->nr_entries) - 1); WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->pba.first, msix->pba.last)); + + zap_on_error = true; + } + else if ( !msix->table.first ) + { + pci_conf_write16(dev->sbdf, msix_control_reg(pos), control); + xfree(entry); + return -ENODATA; } + else + table_paddr = (msix->table.first << PAGE_SHIFT) + + PAGE_OFFSET(table_offset & ~PCI_MSIX_BIRMASK); if ( entry ) { @@ -888,8 +899,15 @@ static int msix_capability_init(struct p if ( idx < 0 ) { - pci_conf_write16(dev->sbdf, msix_control_reg(pos), - control & ~PCI_MSIX_FLAGS_ENABLE); + if ( zap_on_error ) + { + msix->table.first = 0; + msix->pba.first = 0; + + control &= ~PCI_MSIX_FLAGS_ENABLE; + } + + pci_conf_write16(dev->sbdf, msix_control_reg(pos), control); xfree(entry); return idx; } @@ -1078,9 +1096,14 @@ static void _pci_cleanup_msix(struct arc if ( 
rangeset_remove_range(mmio_ro_ranges, msix->table.first, msix->table.last) ) WARN(); + msix->table.first = 0; + msix->table.last = 0; + if ( rangeset_remove_range(mmio_ro_ranges, msix->pba.first, msix->pba.last) ) WARN(); + msix->pba.first = 0; + msix->pba.last = 0; } } ++++++ 5f6a062c-evtchn-relax-port_is_valid.patch -> 5f6a00c4-evtchn-relax-port_is_valid.patch ++++++ --- /work/SRC/openSUSE:Factory/xen/5f6a062c-evtchn-relax-port_is_valid.patch 2020-09-25 16:21:23.363351410 +0200 +++ /work/SRC/openSUSE:Factory/.xen.new.3463/5f6a00c4-evtchn-relax-port_is_valid.patch 2020-11-02 09:38:38.753509315 +0100 @@ -1,7 +1,8 @@ -Subject: evtchn: relax port_is_valid() -From: Jan Beulich jbeulich@suse.com Tue Sep 22 16:11:56 2020 +0200 -Date: Tue Sep 22 16:11:56 2020 +0200: -Git: e417504febf4ef5cc3442ebaa00da1a5c7939b22 +# Commit e59ce972d1280c6c55065da822e0860845582053 +# Date 2020-09-22 15:48:52 +0200 +# Author Jan Beulich <jbeulich@suse.com> +# Committer Jan Beulich <jbeulich@suse.com> +evtchn: relax port_is_valid() To avoid ports potentially becoming invalid behind the back of certain other functions (due to ->max_evtchn shrinking) because of @@ -29,11 +30,9 @@ Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> Reviewed-by: Julien Grall <jgrall@amazon.com> -diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h -index a7798f6765..ce45298377 100644 --- a/xen/include/xen/event.h +++ b/xen/include/xen/event.h -@@ -107,8 +107,6 @@ void notify_via_xen_event_channel(struct domain *ld, int lport); +@@ -107,8 +107,6 @@ void notify_via_xen_event_channel(struct static inline bool_t port_is_valid(struct domain *d, unsigned int p) { ++++++ 5f6a00df-x86-PV-avoid-double-exception-injection.patch ++++++ # Commit 910093d54fc758e7d69261b344fdc8da3a7bd81e # Date 2020-09-22 15:49:19 +0200 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Jan Beulich <jbeulich@suse.com> x86/pv: Avoid double exception injection There is at least one path (SYSENTER with NT set, Xen converts to #GP) which ends up injecting the #GP fault twice, first in compat_sysenter(), and then a second time in compat_test_all_events(), due to the stale TBF_EXCEPTION left in TRAPBOUNCE_flags. The guest kernel sees the second fault first, which is a kernel level #GP pointing at the head of the #GP handler, and is therefore a userspace trigger-able DoS. This particular bug has bitten us several times before, so rearrange {compat_,}create_bounce_frame() to clobber TRAPBOUNCE on success, rather than leaving this task to one area of code which isn't used uniformly. Other scenarios which might result in a double injection (e.g. two calls directly to compat_create_bounce_frame) will now crash the guest, which is far more obvious than letting the kernel run with corrupt state. This is XSA-339 Fixes: fdac9515607b ("x86: clear EFLAGS.NT in SYSENTER entry path") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/x86_64/compat/entry.S +++ b/xen/arch/x86/x86_64/compat/entry.S @@ -78,7 +78,6 @@ compat_process_softirqs: sti .Lcompat_bounce_exception: call compat_create_bounce_frame - movb $0, TRAPBOUNCE_flags(%rdx) jmp compat_test_all_events ALIGN @@ -352,7 +351,13 @@ __UNLIKELY_END(compat_bounce_null_select movl %eax,UREGS_cs+8(%rsp) movl TRAPBOUNCE_eip(%rdx),%eax movl %eax,UREGS_rip+8(%rsp) + + /* Trapbounce complete. Clobber state to avoid an erroneous second injection. 
*/ + xor %eax, %eax + mov %ax, TRAPBOUNCE_cs(%rdx) + mov %al, TRAPBOUNCE_flags(%rdx) ret + .section .fixup,"ax" .Lfx13: xorl %edi,%edi --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -90,7 +90,6 @@ process_softirqs: sti .Lbounce_exception: call create_bounce_frame - movb $0, TRAPBOUNCE_flags(%rdx) jmp test_all_events ALIGN @@ -512,6 +511,11 @@ UNLIKELY_START(z, create_bounce_frame_ba jmp asm_domain_crash_synchronous /* Does not return */ __UNLIKELY_END(create_bounce_frame_bad_bounce_ip) movq %rax,UREGS_rip+8(%rsp) + + /* Trapbounce complete. Clobber state to avoid an erroneous second injection. */ + xor %eax, %eax + mov %rax, TRAPBOUNCE_eip(%rdx) + mov %al, TRAPBOUNCE_flags(%rdx) ret .pushsection .fixup, "ax", @progbits ++++++ 5f6a00f4-evtchn-add-missing-barriers.patch ++++++ # Commit 112992b05b2d2ca63f3c78eefe1cf8d192d7303a # Date 2020-09-22 15:49:40 +0200 # Author Julien Grall <jgrall@amazon.com> # Committer Jan Beulich <jbeulich@suse.com> xen/evtchn: Add missing barriers when accessing/allocating an event channel While the allocation of a bucket is always performed with the per-domain lock, the bucket may be accessed without the lock taken (for instance, see evtchn_send()). Instead such sites relies on port_is_valid() to return a non-zero value when the port has a struct evtchn associated to it. The function will mostly check whether the port is less than d->valid_evtchns as all the buckets/event channels should be allocated up to that point. Unfortunately a compiler is free to re-order the assignment in evtchn_allocate_port() so it would be possible to have d->valid_evtchns updated before the new bucket has finish to allocate. Additionally on Arm, even if this was compiled "correctly", the processor can still re-order the memory access. Add a write memory barrier in the allocation side and a read memory barrier when the port is valid to prevent any re-ordering issue. This is XSA-340. Reported-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Julien Grall <jgrall@amazon.com> Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c @@ -178,6 +178,13 @@ int evtchn_allocate_port(struct domain * return -ENOMEM; bucket_from_port(d, port) = chn; + /* + * d->valid_evtchns is used to check whether the bucket can be + * accessed without the per-domain lock. Therefore, + * d->valid_evtchns should be seen *after* the new bucket has + * been setup. + */ + smp_wmb(); write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET); } --- a/xen/include/xen/event.h +++ b/xen/include/xen/event.h @@ -107,7 +107,17 @@ void notify_via_xen_event_channel(struct static inline bool_t port_is_valid(struct domain *d, unsigned int p) { - return p < read_atomic(&d->valid_evtchns); + if ( p >= read_atomic(&d->valid_evtchns) ) + return false; + + /* + * The caller will usually access the event channel afterwards and + * may be done without taking the per-domain lock. The barrier is + * going in pair the smp_wmb() barrier in evtchn_allocate_port(). 
+ */ + smp_rmb(); + + return true; } static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p) ++++++ 5f6a068e-evtchn-x86-enforce-correct-upper-limit-for-32-bit-guests.patch -> 5f6a0111-evtchn-x86-enforce-correct-upper-limit.patch ++++++ --- /work/SRC/openSUSE:Factory/xen/5f6a068e-evtchn-x86-enforce-correct-upper-limit-for-32-bit-guests.patch 2020-09-25 16:21:28.159355656 +0200 +++ /work/SRC/openSUSE:Factory/.xen.new.3463/5f6a0111-evtchn-x86-enforce-correct-upper-limit.patch 2020-11-02 09:38:38.825509384 +0100 @@ -1,7 +1,8 @@ -Subject: evtchn/x86: enforce correct upper limit for 32-bit guests -From: Jan Beulich jbeulich@suse.com Tue Sep 22 16:13:34 2020 +0200 -Date: Tue Sep 22 16:13:34 2020 +0200: -Git: b8c2efbe7b3e8fa5f0b0a3679afccd1204949070 +# Commit 62bcdc4edbf6d8c6e8a25544d48de22ccf75310d +# Date 2020-09-22 15:50:09 +0200 +# Author Jan Beulich <jbeulich@suse.com> +# Committer Jan Beulich <jbeulich@suse.com> +evtchn/x86: enforce correct upper limit for 32-bit guests The recording of d->max_evtchns in evtchn_2l_init(), in particular with the limited set of callers of the function, is insufficient. Neither for @@ -34,11 +35,9 @@ Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> Reviewed-by: Julien Grall <jgrall@amazon.com> -diff --git a/xen/common/event_2l.c b/xen/common/event_2l.c -index e1dbb860f4..a229d35271 100644 --- a/xen/common/event_2l.c +++ b/xen/common/event_2l.c -@@ -103,7 +103,6 @@ static const struct evtchn_port_ops evtchn_port_ops_2l = +@@ -103,7 +103,6 @@ static const struct evtchn_port_ops evtc void evtchn_2l_init(struct domain *d) { d->evtchn_port_ops = &evtchn_port_ops_2l; @@ -46,11 +45,9 @@ } /* -diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c -index 53c17bd354..08ffe0f063 100644 --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c -@@ -151,7 +151,7 @@ static void free_evtchn_bucket(struct domain *d, struct evtchn *bucket) +@@ -151,7 +151,7 @@ static void free_evtchn_bucket(struct do int evtchn_allocate_port(struct domain *d, evtchn_port_t port) { @@ -59,7 +56,7 @@ return -ENOSPC; if ( port_is_valid(d, port) ) -@@ -1396,13 +1396,11 @@ static void domain_dump_evtchn_info(struct domain *d) +@@ -1396,13 +1396,11 @@ static void domain_dump_evtchn_info(stru spin_lock(&d->event_lock); @@ -74,11 +71,9 @@ chn = evtchn_from_port(d, port); if ( chn->state == ECS_FREE ) continue; -diff --git a/xen/common/event_fifo.c b/xen/common/event_fifo.c -index 230f440f14..2f13d92128 100644 --- a/xen/common/event_fifo.c +++ b/xen/common/event_fifo.c -@@ -478,7 +478,7 @@ static void cleanup_event_array(struct domain *d) +@@ -478,7 +478,7 @@ static void cleanup_event_array(struct d d->evtchn_fifo = NULL; } @@ -87,7 +82,7 @@ { unsigned int port; -@@ -488,7 +488,7 @@ static void setup_ports(struct domain *d) +@@ -488,7 +488,7 @@ static void setup_ports(struct domain *d * - save its pending state. * - set default priority. 
*/ @@ -96,7 +91,7 @@ { struct evtchn *evtchn; -@@ -546,6 +546,8 @@ int evtchn_fifo_init_control(struct evtchn_init_control *init_control) +@@ -546,6 +546,8 @@ int evtchn_fifo_init_control(struct evtc if ( !d->evtchn_fifo ) { struct vcpu *vcb; @@ -105,7 +100,7 @@ for_each_vcpu ( d, vcb ) { rc = setup_control_block(vcb); -@@ -562,8 +564,7 @@ int evtchn_fifo_init_control(struct evtchn_init_control *init_control) +@@ -562,8 +564,7 @@ int evtchn_fifo_init_control(struct evtc goto error; d->evtchn_port_ops = &evtchn_port_ops_fifo; @@ -115,11 +110,9 @@ } else rc = map_control_block(v, gfn, offset); -diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c -index cb49a8bc02..ab94d2ec3a 100644 --- a/xen/common/sched/core.c +++ b/xen/common/sched/core.c -@@ -1428,7 +1428,7 @@ static long do_poll(struct sched_poll *sched_poll) +@@ -1428,7 +1428,7 @@ static long do_poll(struct sched_poll *s goto out; rc = -EINVAL; @@ -128,11 +121,9 @@ goto out; rc = 0; -diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h -index c35f4b23b6..e1b299e8df 100644 --- a/xen/include/xen/event.h +++ b/xen/include/xen/event.h -@@ -105,6 +105,12 @@ void notify_via_xen_event_channel(struct domain *ld, int lport); +@@ -105,6 +105,12 @@ void notify_via_xen_event_channel(struct #define bucket_from_port(d, p) \ ((group_from_port(d, p))[((p) % EVTCHNS_PER_GROUP) / EVTCHNS_PER_BUCKET]) @@ -145,8 +136,6 @@ static inline bool_t port_is_valid(struct domain *d, unsigned int p) { if ( p >= read_atomic(&d->valid_evtchns) ) -diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h -index ac53519d7f..545f2bdcd0 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -359,7 +359,6 @@ struct domain ++++++ 5f6a013f-evtchn_reset-shouldnt-succeed-with.patch ++++++ # Commit 8d385b247bca40ece40c9279391054bc98934325 # Date 2020-09-22 15:50:55 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> evtchn: evtchn_reset() shouldn't succeed with still-open ports While the function closes all ports, it does so without holding any lock, and hence racing requests may be issued causing new ports to get opened. This would have been problematic in particular if such a newly opened port had a port number above the new implementation limit (i.e. when switching from FIFO to 2-level) after the reset, as prior to "evtchn: relax port_is_valid()" this could have led to e.g. evtchn_close()'s "BUG_ON(!port_is_valid(d2, port2))" to trigger. Introduce a counter of active ports and check that it's (still) no larger then the number of Xen internally used ones after obtaining the necessary lock in evtchn_reset(). As to the access model of the new {active,xen}_evtchns fields - while all writes get done using write_atomic(), reads ought to use read_atomic() only when outside of a suitably locked region. Note that as of now evtchn_bind_virq() and evtchn_bind_ipi() don't have a need to call check_free_port(). This is part of XSA-343. 
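A standalone C model of the two counters and of the check performed by evtchn_reset() might look as follows (struct dom and try_reset() are illustrative only; the real code additionally uses read_atomic()/write_atomic() and holds the event lock):

    #include <errno.h>
    #include <stdio.h>

    /* Illustrative counters mirroring the new domain fields. */
    struct dom {
        unsigned int active_evtchns;  /* all in-use ports */
        unsigned int xen_evtchns;     /* ports consumed internally by Xen */
    };

    /* Refuse the ABI switch while guest-created ports remain open. */
    static int try_reset(const struct dom *d)
    {
        return d->active_evtchns > d->xen_evtchns ? -EAGAIN : 0;
    }

    int main(void)
    {
        struct dom d = { .active_evtchns = 3, .xen_evtchns = 1 };

        printf("with open guest ports: %d\n", try_reset(&d));   /* -EAGAIN */
        d.active_evtchns = 1;
        printf("only Xen-internal left: %d\n", try_reset(&d));  /* 0 */
        return 0;
    }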
Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> Reviewed-by: Julien Grall <jgrall@amazon.com> --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c @@ -188,6 +188,8 @@ int evtchn_allocate_port(struct domain * write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET); } + write_atomic(&d->active_evtchns, d->active_evtchns + 1); + return 0; } @@ -211,11 +213,26 @@ static int get_free_port(struct domain * return -ENOSPC; } +/* + * Check whether a port is still marked free, and if so update the domain + * counter accordingly. To be used on function exit paths. + */ +static void check_free_port(struct domain *d, evtchn_port_t port) +{ + if ( port_is_valid(d, port) && + evtchn_from_port(d, port)->state == ECS_FREE ) + write_atomic(&d->active_evtchns, d->active_evtchns - 1); +} + void evtchn_free(struct domain *d, struct evtchn *chn) { /* Clear pending event to avoid unexpected behavior on re-bind. */ evtchn_port_clear_pending(d, chn); + if ( consumer_is_xen(chn) ) + write_atomic(&d->xen_evtchns, d->xen_evtchns - 1); + write_atomic(&d->active_evtchns, d->active_evtchns - 1); + /* Reset binding to vcpu0 when the channel is freed. */ chn->state = ECS_FREE; chn->notify_vcpu_id = 0; @@ -258,6 +275,7 @@ static long evtchn_alloc_unbound(evtchn_ alloc->port = port; out: + check_free_port(d, port); spin_unlock(&d->event_lock); rcu_unlock_domain(d); @@ -351,6 +369,7 @@ static long evtchn_bind_interdomain(evtc bind->local_port = lport; out: + check_free_port(ld, lport); spin_unlock(&ld->event_lock); if ( ld != rd ) spin_unlock(&rd->event_lock); @@ -488,7 +507,7 @@ static long evtchn_bind_pirq(evtchn_bind struct domain *d = current->domain; struct vcpu *v = d->vcpu[0]; struct pirq *info; - int port, pirq = bind->pirq; + int port = 0, pirq = bind->pirq; long rc; if ( (pirq < 0) || (pirq >= d->nr_pirqs) ) @@ -536,6 +555,7 @@ static long evtchn_bind_pirq(evtchn_bind arch_evtchn_bind_pirq(d, pirq); out: + check_free_port(d, port); spin_unlock(&d->event_lock); return rc; @@ -1011,10 +1031,10 @@ int evtchn_unmask(unsigned int port) return 0; } - int evtchn_reset(struct domain *d) { unsigned int i; + int rc = 0; if ( d != current->domain && !d->controller_pause_count ) return -EINVAL; @@ -1024,7 +1044,9 @@ int evtchn_reset(struct domain *d) spin_lock(&d->event_lock); - if ( d->evtchn_fifo ) + if ( d->active_evtchns > d->xen_evtchns ) + rc = -EAGAIN; + else if ( d->evtchn_fifo ) { /* Switching back to 2-level ABI. */ evtchn_fifo_destroy(d); @@ -1033,7 +1055,7 @@ int evtchn_reset(struct domain *d) spin_unlock(&d->event_lock); - return 0; + return rc; } static long evtchn_set_priority(const struct evtchn_set_priority *set_priority) @@ -1219,10 +1241,9 @@ int alloc_unbound_xen_event_channel( spin_lock(&ld->event_lock); - rc = get_free_port(ld); + port = rc = get_free_port(ld); if ( rc < 0 ) goto out; - port = rc; chn = evtchn_from_port(ld, port); rc = xsm_evtchn_unbound(XSM_TARGET, ld, chn, remote_domid); @@ -1238,7 +1259,10 @@ int alloc_unbound_xen_event_channel( spin_unlock(&chn->lock); + write_atomic(&ld->xen_evtchns, ld->xen_evtchns + 1); + out: + check_free_port(ld, port); spin_unlock(&ld->event_lock); return rc < 0 ? 
rc : port; @@ -1314,6 +1338,7 @@ int evtchn_init(struct domain *d, unsign return -EINVAL; } evtchn_from_port(d, 0)->state = ECS_RESERVED; + write_atomic(&d->active_evtchns, 0); #if MAX_VIRT_CPUS > BITS_PER_LONG d->poll_mask = xzalloc_array(unsigned long, BITS_TO_LONGS(d->max_vcpus)); @@ -1340,6 +1365,8 @@ void evtchn_destroy(struct domain *d) for ( i = 0; port_is_valid(d, i); i++ ) evtchn_close(d, i, 0); + ASSERT(!d->active_evtchns); + clear_global_virq_handlers(d); evtchn_fifo_destroy(d); --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -361,6 +361,16 @@ struct domain struct evtchn **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */ unsigned int max_evtchn_port; /* max permitted port number */ unsigned int valid_evtchns; /* number of allocated event channels */ + /* + * Number of in-use event channels. Writers should use write_atomic(). + * Readers need to use read_atomic() only when not holding event_lock. + */ + unsigned int active_evtchns; + /* + * Number of event channels used internally by Xen (not subject to + * EVTCHNOP_reset). Read/write access like for active_evtchns. + */ + unsigned int xen_evtchns; spinlock_t event_lock; const struct evtchn_port_ops *evtchn_port_ops; struct evtchn_fifo_domain *evtchn_fifo; ++++++ 5f6a0160-evtchn-IRQ-safe-per-channel-lock.patch ++++++ # Commit c0ddc8634845aba50774add6e4b73fdaffc82656 # Date 2020-09-22 15:51:28 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> evtchn: convert per-channel lock to be IRQ-safe ... in order for send_guest_{global,vcpu}_virq() to be able to make use of it. This is part of XSA-343. Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Julien Grall <jgrall@amazon.com> --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c @@ -248,6 +248,7 @@ static long evtchn_alloc_unbound(evtchn_ int port; domid_t dom = alloc->dom; long rc; + unsigned long flags; d = rcu_lock_domain_by_any_id(dom); if ( d == NULL ) @@ -263,14 +264,14 @@ static long evtchn_alloc_unbound(evtchn_ if ( rc ) goto out; - spin_lock(&chn->lock); + spin_lock_irqsave(&chn->lock, flags); chn->state = ECS_UNBOUND; if ( (chn->u.unbound.remote_domid = alloc->remote_dom) == DOMID_SELF ) chn->u.unbound.remote_domid = current->domain->domain_id; evtchn_port_init(d, chn); - spin_unlock(&chn->lock); + spin_unlock_irqrestore(&chn->lock, flags); alloc->port = port; @@ -283,26 +284,32 @@ static long evtchn_alloc_unbound(evtchn_ } -static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn) +static unsigned long double_evtchn_lock(struct evtchn *lchn, + struct evtchn *rchn) { - if ( lchn < rchn ) + unsigned long flags; + + if ( lchn <= rchn ) { - spin_lock(&lchn->lock); - spin_lock(&rchn->lock); + spin_lock_irqsave(&lchn->lock, flags); + if ( lchn != rchn ) + spin_lock(&rchn->lock); } else { - if ( lchn != rchn ) - spin_lock(&rchn->lock); + spin_lock_irqsave(&rchn->lock, flags); spin_lock(&lchn->lock); } + + return flags; } -static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn) +static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn, + unsigned long flags) { - spin_unlock(&lchn->lock); if ( lchn != rchn ) - spin_unlock(&rchn->lock); + spin_unlock(&lchn->lock); + spin_unlock_irqrestore(&rchn->lock, flags); } static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind) @@ -312,6 +319,7 @@ static long evtchn_bind_interdomain(evtc int lport, rport = bind->remote_port; domid_t rdom = bind->remote_dom; long rc; + unsigned long flags; if ( rdom == 
DOMID_SELF ) rdom = current->domain->domain_id; @@ -347,7 +355,7 @@ static long evtchn_bind_interdomain(evtc if ( rc ) goto out; - double_evtchn_lock(lchn, rchn); + flags = double_evtchn_lock(lchn, rchn); lchn->u.interdomain.remote_dom = rd; lchn->u.interdomain.remote_port = rport; @@ -364,7 +372,7 @@ static long evtchn_bind_interdomain(evtc */ evtchn_port_set_pending(ld, lchn->notify_vcpu_id, lchn); - double_evtchn_unlock(lchn, rchn); + double_evtchn_unlock(lchn, rchn, flags); bind->local_port = lport; @@ -387,6 +395,7 @@ int evtchn_bind_virq(evtchn_bind_virq_t struct domain *d = current->domain; int virq = bind->virq, vcpu = bind->vcpu; int rc = 0; + unsigned long flags; if ( (virq < 0) || (virq >= ARRAY_SIZE(v->virq_to_evtchn)) ) return -EINVAL; @@ -424,14 +433,14 @@ int evtchn_bind_virq(evtchn_bind_virq_t chn = evtchn_from_port(d, port); - spin_lock(&chn->lock); + spin_lock_irqsave(&chn->lock, flags); chn->state = ECS_VIRQ; chn->notify_vcpu_id = vcpu; chn->u.virq = virq; evtchn_port_init(d, chn); - spin_unlock(&chn->lock); + spin_unlock_irqrestore(&chn->lock, flags); v->virq_to_evtchn[virq] = bind->port = port; @@ -448,6 +457,7 @@ static long evtchn_bind_ipi(evtchn_bind_ struct domain *d = current->domain; int port, vcpu = bind->vcpu; long rc = 0; + unsigned long flags; if ( domain_vcpu(d, vcpu) == NULL ) return -ENOENT; @@ -459,13 +469,13 @@ static long evtchn_bind_ipi(evtchn_bind_ chn = evtchn_from_port(d, port); - spin_lock(&chn->lock); + spin_lock_irqsave(&chn->lock, flags); chn->state = ECS_IPI; chn->notify_vcpu_id = vcpu; evtchn_port_init(d, chn); - spin_unlock(&chn->lock); + spin_unlock_irqrestore(&chn->lock, flags); bind->port = port; @@ -509,6 +519,7 @@ static long evtchn_bind_pirq(evtchn_bind struct pirq *info; int port = 0, pirq = bind->pirq; long rc; + unsigned long flags; if ( (pirq < 0) || (pirq >= d->nr_pirqs) ) return -EINVAL; @@ -541,14 +552,14 @@ static long evtchn_bind_pirq(evtchn_bind goto out; } - spin_lock(&chn->lock); + spin_lock_irqsave(&chn->lock, flags); chn->state = ECS_PIRQ; chn->u.pirq.irq = pirq; link_pirq_port(port, chn, v); evtchn_port_init(d, chn); - spin_unlock(&chn->lock); + spin_unlock_irqrestore(&chn->lock, flags); bind->port = port; @@ -569,6 +580,7 @@ int evtchn_close(struct domain *d1, int struct evtchn *chn1, *chn2; int port2; long rc = 0; + unsigned long flags; again: spin_lock(&d1->event_lock); @@ -668,14 +680,14 @@ int evtchn_close(struct domain *d1, int BUG_ON(chn2->state != ECS_INTERDOMAIN); BUG_ON(chn2->u.interdomain.remote_dom != d1); - double_evtchn_lock(chn1, chn2); + flags = double_evtchn_lock(chn1, chn2); evtchn_free(d1, chn1); chn2->state = ECS_UNBOUND; chn2->u.unbound.remote_domid = d1->domain_id; - double_evtchn_unlock(chn1, chn2); + double_evtchn_unlock(chn1, chn2, flags); goto out; @@ -683,9 +695,9 @@ int evtchn_close(struct domain *d1, int BUG(); } - spin_lock(&chn1->lock); + spin_lock_irqsave(&chn1->lock, flags); evtchn_free(d1, chn1); - spin_unlock(&chn1->lock); + spin_unlock_irqrestore(&chn1->lock, flags); out: if ( d2 != NULL ) @@ -705,13 +717,14 @@ int evtchn_send(struct domain *ld, unsig struct evtchn *lchn, *rchn; struct domain *rd; int rport, ret = 0; + unsigned long flags; if ( !port_is_valid(ld, lport) ) return -EINVAL; lchn = evtchn_from_port(ld, lport); - spin_lock(&lchn->lock); + spin_lock_irqsave(&lchn->lock, flags); /* Guest cannot send via a Xen-attached event channel. 
*/ if ( unlikely(consumer_is_xen(lchn)) ) @@ -746,7 +759,7 @@ int evtchn_send(struct domain *ld, unsig } out: - spin_unlock(&lchn->lock); + spin_unlock_irqrestore(&lchn->lock, flags); return ret; } @@ -1238,6 +1251,7 @@ int alloc_unbound_xen_event_channel( { struct evtchn *chn; int port, rc; + unsigned long flags; spin_lock(&ld->event_lock); @@ -1250,14 +1264,14 @@ int alloc_unbound_xen_event_channel( if ( rc ) goto out; - spin_lock(&chn->lock); + spin_lock_irqsave(&chn->lock, flags); chn->state = ECS_UNBOUND; chn->xen_consumer = get_xen_consumer(notification_fn); chn->notify_vcpu_id = lvcpu; chn->u.unbound.remote_domid = remote_domid; - spin_unlock(&chn->lock); + spin_unlock_irqrestore(&chn->lock, flags); write_atomic(&ld->xen_evtchns, ld->xen_evtchns + 1); @@ -1280,11 +1294,12 @@ void notify_via_xen_event_channel(struct { struct evtchn *lchn, *rchn; struct domain *rd; + unsigned long flags; ASSERT(port_is_valid(ld, lport)); lchn = evtchn_from_port(ld, lport); - spin_lock(&lchn->lock); + spin_lock_irqsave(&lchn->lock, flags); if ( likely(lchn->state == ECS_INTERDOMAIN) ) { @@ -1294,7 +1309,7 @@ void notify_via_xen_event_channel(struct evtchn_port_set_pending(rd, rchn->notify_vcpu_id, rchn); } - spin_unlock(&lchn->lock); + spin_unlock_irqrestore(&lchn->lock, flags); } void evtchn_check_pollers(struct domain *d, unsigned int port) ++++++ 5f6a06f2-evtchn-address-races-with-evtchn_reset.patch -> 5f6a0178-evtchn-address-races-with-evtchn_reset.patch ++++++ --- /work/SRC/openSUSE:Factory/xen/5f6a06f2-evtchn-address-races-with-evtchn_reset.patch 2020-09-25 16:21:30.175357441 +0200 +++ /work/SRC/openSUSE:Factory/.xen.new.3463/5f6a0178-evtchn-address-races-with-evtchn_reset.patch 2020-11-02 09:38:38.893509450 +0100 @@ -1,7 +1,8 @@ -Subject: evtchn: address races with evtchn_reset() -From: Jan Beulich jbeulich@suse.com Tue Sep 22 16:15:14 2020 +0200 -Date: Tue Sep 22 16:15:14 2020 +0200: -Git: ecc6428b7ea63a24e244f747e8568c0ccc03a6f8 +# Commit e045199c7c9c5433d7f1461a741ed539a75cbfad +# Date 2020-09-22 15:51:52 +0200 +# Author Jan Beulich <jbeulich@suse.com> +# Committer Jan Beulich <jbeulich@suse.com> +evtchn: address races with evtchn_reset() Neither d->evtchn_port_ops nor max_evtchns(d) may be used in an entirely lock-less manner, as both may change by a racing evtchn_reset(). In the @@ -29,8 +30,6 @@ Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Julien Grall <jgrall@amazon.com> -diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c -index a69937c840..93c4fb9a79 100644 --- a/xen/arch/x86/irq.c +++ b/xen/arch/x86/irq.c @@ -2488,14 +2488,24 @@ static void dump_irqs(unsigned char key) @@ -62,11 +61,9 @@ i < action->nr_guests ? 
',' : '\n'); } } -diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c -index 3a0525c209..9aef7a860a 100644 --- a/xen/arch/x86/pv/shim.c +++ b/xen/arch/x86/pv/shim.c -@@ -660,8 +660,11 @@ void pv_shim_inject_evtchn(unsigned int port) +@@ -660,8 +660,11 @@ void pv_shim_inject_evtchn(unsigned int if ( port_is_valid(guest, port) ) { struct evtchn *chn = evtchn_from_port(guest, port); @@ -78,11 +75,9 @@ } } -diff --git a/xen/common/event_2l.c b/xen/common/event_2l.c -index a229d35271..083d04be3c 100644 --- a/xen/common/event_2l.c +++ b/xen/common/event_2l.c -@@ -63,8 +63,10 @@ static void evtchn_2l_unmask(struct domain *d, struct evtchn *evtchn) +@@ -63,8 +63,10 @@ static void evtchn_2l_unmask(struct doma } } @@ -94,7 +89,7 @@ unsigned int max_ports = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); ASSERT(port < max_ports); -@@ -72,8 +74,10 @@ static bool evtchn_2l_is_pending(const struct domain *d, evtchn_port_t port) +@@ -72,8 +74,10 @@ static bool evtchn_2l_is_pending(const s guest_test_bit(d, port, &shared_info(d, evtchn_pending))); } @@ -106,11 +101,9 @@ unsigned int max_ports = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); ASSERT(port < max_ports); -diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c -index 0e550e9c7a..878e4250ed 100644 --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c -@@ -156,8 +156,9 @@ int evtchn_allocate_port(struct domain *d, evtchn_port_t port) +@@ -156,8 +156,9 @@ int evtchn_allocate_port(struct domain * if ( port_is_valid(d, port) ) { @@ -122,7 +115,7 @@ return -EBUSY; } else -@@ -774,6 +775,7 @@ void send_guest_vcpu_virq(struct vcpu *v, uint32_t virq) +@@ -774,6 +775,7 @@ void send_guest_vcpu_virq(struct vcpu *v unsigned long flags; int port; struct domain *d; @@ -130,7 +123,7 @@ ASSERT(!virq_is_global(virq)); -@@ -784,7 +786,10 @@ void send_guest_vcpu_virq(struct vcpu *v, uint32_t virq) +@@ -784,7 +786,10 @@ void send_guest_vcpu_virq(struct vcpu *v goto out; d = v->domain; @@ -142,7 +135,7 @@ out: spin_unlock_irqrestore(&v->virq_lock, flags); -@@ -813,7 +818,9 @@ void send_guest_global_virq(struct domain *d, uint32_t virq) +@@ -813,7 +818,9 @@ void send_guest_global_virq(struct domai goto out; chn = evtchn_from_port(d, port); @@ -152,7 +145,7 @@ out: spin_unlock_irqrestore(&v->virq_lock, flags); -@@ -823,6 +830,7 @@ void send_guest_pirq(struct domain *d, const struct pirq *pirq) +@@ -823,6 +830,7 @@ void send_guest_pirq(struct domain *d, c { int port; struct evtchn *chn; @@ -160,7 +153,7 @@ /* * PV guests: It should not be possible to race with __evtchn_close(). 
The -@@ -837,7 +845,9 @@ void send_guest_pirq(struct domain *d, const struct pirq *pirq) +@@ -837,7 +845,9 @@ void send_guest_pirq(struct domain *d, c } chn = evtchn_from_port(d, port); @@ -186,7 +179,7 @@ return 0; } -@@ -1449,8 +1462,8 @@ static void domain_dump_evtchn_info(struct domain *d) +@@ -1449,8 +1462,8 @@ static void domain_dump_evtchn_info(stru printk(" %4u [%d/%d/", port, @@ -197,11 +190,9 @@ evtchn_port_print_state(d, chn); printk("]: s=%d n=%d x=%d", chn->state, chn->notify_vcpu_id, chn->xen_consumer); -diff --git a/xen/common/event_fifo.c b/xen/common/event_fifo.c -index 2f13d92128..68d0c7a632 100644 --- a/xen/common/event_fifo.c +++ b/xen/common/event_fifo.c -@@ -296,23 +296,26 @@ static void evtchn_fifo_unmask(struct domain *d, struct evtchn *evtchn) +@@ -296,23 +296,26 @@ static void evtchn_fifo_unmask(struct do evtchn_fifo_set_pending(v, evtchn); } @@ -234,11 +225,9 @@ return word && guest_test_bit(d, EVTCHN_FIFO_LINKED, word); } -diff --git a/xen/include/asm-x86/event.h b/xen/include/asm-x86/event.h -index 98a85233cb..5e09ede6d7 100644 --- a/xen/include/asm-x86/event.h +++ b/xen/include/asm-x86/event.h -@@ -47,4 +47,10 @@ static inline bool arch_virq_is_global(unsigned int virq) +@@ -47,4 +47,10 @@ static inline bool arch_virq_is_global(u return true; } @@ -249,11 +238,9 @@ +#endif + #endif -diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h -index e1b299e8df..bc9aa68650 100644 --- a/xen/include/xen/event.h +++ b/xen/include/xen/event.h -@@ -133,6 +133,24 @@ static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p) +@@ -133,6 +133,24 @@ static inline struct evtchn *evtchn_from return bucket_from_port(d, p) + (p % EVTCHNS_PER_BUCKET); } @@ -306,7 +293,7 @@ int (*set_priority)(struct domain *d, struct evtchn *evtchn, unsigned int priority); void (*print_state)(struct domain *d, const struct evtchn *evtchn); -@@ -193,38 +216,67 @@ static inline void evtchn_port_set_pending(struct domain *d, +@@ -193,38 +216,67 @@ static inline void evtchn_port_set_pendi unsigned int vcpu_id, struct evtchn *evtchn) { @@ -386,7 +373,7 @@ } static inline int evtchn_port_set_priority(struct domain *d, -@@ -233,6 +285,8 @@ static inline int evtchn_port_set_priority(struct domain *d, +@@ -233,6 +285,8 @@ static inline int evtchn_port_set_priori { if ( !d->evtchn_port_ops->set_priority ) return -ENOSYS; ++++++ 5f6a01a4-evtchn-preempt-in-evtchn_destroy.patch ++++++ # Commit 8fe7b5f9960f4d9ec46787af394d180c39c6b2db # Date 2020-09-22 15:52:36 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> evtchn: arrange for preemption in evtchn_destroy() Especially closing of fully established interdomain channels can take quite some time, due to the locking involved. Therefore we shouldn't assume we can clean up still active ports all in one go. Besides adding the necessary preemption check, also avoid pointlessly starting from (or now really ending at) 0; 1 is the lowest numbered port which may need closing. Since we're now reducing ->valid_evtchns, free_xen_event_channel(), and (at least to be on the safe side) notify_via_xen_event_channel() need to cope with attempts to close / unbind from / send through already closed (and no longer valid, as per port_is_valid()) ports. This is part of XSA-344. 
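The continuation pattern can be modelled in a few lines of standalone C; destroy_ports() and need_preempt() below are simplified stand-ins (no locking, no domain state), not the hypervisor code itself:

    #include <stdbool.h>
    #include <stdio.h>

    #ifndef ERESTART
    #define ERESTART 85            /* Xen/Linux "restart the call" error code */
    #endif

    /* Stand-in for hypercall_preempt_check(): preempt every 64 ports. */
    static bool need_preempt(unsigned int i)
    {
        return !(i & 0x3f);
    }

    /*
     * Close ports from the top down, recording progress in *valid so a
     * restarted call resumes where the previous one stopped.
     */
    static int destroy_ports(unsigned int *valid)
    {
        for ( unsigned int i = *valid; --i; )
        {
            /* closing of port i would go here */
            if ( need_preempt(i) )
            {
                *valid = i;        /* port i is closed; resume below it */
                return -ERESTART;
            }
        }
        return 0;
    }

    int main(void)
    {
        unsigned int valid = 200;
        int rc;

        while ( (rc = destroy_ports(&valid)) == -ERESTART )
            printf("preempted, %u ports still valid\n", valid);
        return rc;
    }

In this model the caller simply re-invokes the operation until it stops returning -ERESTART, which is what the hypercall continuation machinery does on the guest's behalf.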
Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Julien Grall <jgrall@amazon.com> Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -715,12 +715,14 @@ int domain_kill(struct domain *d) return domain_kill(d); d->is_dying = DOMDYING_dying; argo_destroy(d); - evtchn_destroy(d); gnttab_release_mappings(d); vnuma_destroy(d->vnuma); domain_set_outstanding_pages(d, 0); /* fallthrough */ case DOMDYING_dying: + rc = evtchn_destroy(d); + if ( rc ) + break; rc = domain_relinquish_resources(d); if ( rc != 0 ) break; --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c @@ -1297,7 +1297,16 @@ int alloc_unbound_xen_event_channel( void free_xen_event_channel(struct domain *d, int port) { - BUG_ON(!port_is_valid(d, port)); + if ( !port_is_valid(d, port) ) + { + /* + * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing + * with the spin_barrier() and BUG_ON() in evtchn_destroy(). + */ + smp_rmb(); + BUG_ON(!d->is_dying); + return; + } evtchn_close(d, port, 0); } @@ -1309,7 +1318,17 @@ void notify_via_xen_event_channel(struct struct domain *rd; unsigned long flags; - ASSERT(port_is_valid(ld, lport)); + if ( !port_is_valid(ld, lport) ) + { + /* + * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing + * with the spin_barrier() and BUG_ON() in evtchn_destroy(). + */ + smp_rmb(); + ASSERT(ld->is_dying); + return; + } + lchn = evtchn_from_port(ld, lport); spin_lock_irqsave(&lchn->lock, flags); @@ -1380,8 +1399,7 @@ int evtchn_init(struct domain *d, unsign return 0; } - -void evtchn_destroy(struct domain *d) +int evtchn_destroy(struct domain *d) { unsigned int i; @@ -1390,14 +1408,29 @@ void evtchn_destroy(struct domain *d) spin_barrier(&d->event_lock); /* Close all existing event channels. */ - for ( i = 0; port_is_valid(d, i); i++ ) + for ( i = d->valid_evtchns; --i; ) + { evtchn_close(d, i, 0); + /* + * Avoid preempting when called from domain_create()'s error path, + * and don't check too often (choice of frequency is arbitrary). + */ + if ( i && !(i & 0x3f) && d->is_dying != DOMDYING_dead && + hypercall_preempt_check() ) + { + write_atomic(&d->valid_evtchns, i); + return -ERESTART; + } + } + ASSERT(!d->active_evtchns); clear_global_virq_handlers(d); evtchn_fifo_destroy(d); + + return 0; } --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -138,7 +138,7 @@ struct evtchn } __attribute__((aligned(64))); int evtchn_init(struct domain *d, unsigned int max_port); -void evtchn_destroy(struct domain *d); /* from domain_kill */ +int evtchn_destroy(struct domain *d); /* from domain_kill */ void evtchn_destroy_final(struct domain *d); /* from complete_domain_destroy */ struct waitqueue_vcpu; ++++++ 5f6a01c6-evtchn-preempt-in-evtchn_reset.patch ++++++ # Commit 2785b2a9e04abc148e1c5259f4faee708ea356f4 # Date 2020-09-22 15:53:10 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> evtchn: arrange for preemption in evtchn_reset() Like for evtchn_destroy() looping over all possible event channels to close them can take a significant amount of time. Unlike done there, we can't alter domain properties (i.e. d->valid_evtchns) here. Borrow, in a lightweight form, the paging domctl continuation concept, redirecting the continuations to different sub-ops. Just like there this is to be able to allow for predictable overall results of the involved sub-ops: Racing requests should either complete or be refused. 
Note that a domain can't interfere with an already started (by a remote domain) reset, due to being paused. It can prevent a remote reset from happening by leaving a reset unfinished, but that's only going to affect itself. This is part of XSA-344. Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Julien Grall <jgrall@amazon.com> Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -1145,7 +1145,7 @@ void domain_unpause_except_self(struct d domain_unpause(d); } -int domain_soft_reset(struct domain *d) +int domain_soft_reset(struct domain *d, bool resuming) { struct vcpu *v; int rc; @@ -1159,7 +1159,7 @@ int domain_soft_reset(struct domain *d) } spin_unlock(&d->shutdown_lock); - rc = evtchn_reset(d); + rc = evtchn_reset(d, resuming); if ( rc ) return rc; --- a/xen/common/domctl.c +++ b/xen/common/domctl.c @@ -562,12 +562,22 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe } case XEN_DOMCTL_soft_reset: + case XEN_DOMCTL_soft_reset_cont: if ( d == current->domain ) /* no domain_pause() */ { ret = -EINVAL; break; } - ret = domain_soft_reset(d); + ret = domain_soft_reset(d, op->cmd == XEN_DOMCTL_soft_reset_cont); + if ( ret == -ERESTART ) + { + op->cmd = XEN_DOMCTL_soft_reset_cont; + if ( !__copy_field_to_guest(u_domctl, op, cmd) ) + ret = hypercall_create_continuation(__HYPERVISOR_domctl, + "h", u_domctl); + else + ret = -EFAULT; + } break; case XEN_DOMCTL_destroydomain: --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c @@ -1057,7 +1057,7 @@ int evtchn_unmask(unsigned int port) return 0; } -int evtchn_reset(struct domain *d) +int evtchn_reset(struct domain *d, bool resuming) { unsigned int i; int rc = 0; @@ -1065,11 +1065,40 @@ int evtchn_reset(struct domain *d) if ( d != current->domain && !d->controller_pause_count ) return -EINVAL; - for ( i = 0; port_is_valid(d, i); i++ ) + spin_lock(&d->event_lock); + + /* + * If we are resuming, then start where we stopped. Otherwise, check + * that a reset operation is not already in progress, and if none is, + * record that this is now the case. + */ + i = resuming ? d->next_evtchn : !d->next_evtchn; + if ( i > d->next_evtchn ) + d->next_evtchn = i; + + spin_unlock(&d->event_lock); + + if ( !i ) + return -EBUSY; + + for ( ; port_is_valid(d, i); i++ ) + { evtchn_close(d, i, 1); + /* NB: Choice of frequency is arbitrary. 
*/ + if ( !(i & 0x3f) && hypercall_preempt_check() ) + { + spin_lock(&d->event_lock); + d->next_evtchn = i; + spin_unlock(&d->event_lock); + return -ERESTART; + } + } + spin_lock(&d->event_lock); + d->next_evtchn = 0; + if ( d->active_evtchns > d->xen_evtchns ) rc = -EAGAIN; else if ( d->evtchn_fifo ) @@ -1204,7 +1233,8 @@ long do_event_channel_op(int cmd, XEN_GU break; } - case EVTCHNOP_reset: { + case EVTCHNOP_reset: + case EVTCHNOP_reset_cont: { struct evtchn_reset reset; struct domain *d; @@ -1217,9 +1247,13 @@ long do_event_channel_op(int cmd, XEN_GU rc = xsm_evtchn_reset(XSM_TARGET, current->domain, d); if ( !rc ) - rc = evtchn_reset(d); + rc = evtchn_reset(d, cmd == EVTCHNOP_reset_cont); rcu_unlock_domain(d); + + if ( rc == -ERESTART ) + rc = hypercall_create_continuation(__HYPERVISOR_event_channel_op, + "ih", EVTCHNOP_reset_cont, arg); break; } --- a/xen/include/public/domctl.h +++ b/xen/include/public/domctl.h @@ -1159,7 +1159,10 @@ struct xen_domctl { #define XEN_DOMCTL_iomem_permission 20 #define XEN_DOMCTL_ioport_permission 21 #define XEN_DOMCTL_hypercall_init 22 -#define XEN_DOMCTL_arch_setup 23 /* Obsolete IA64 only */ +#ifdef __XEN__ +/* #define XEN_DOMCTL_arch_setup 23 Obsolete IA64 only */ +#define XEN_DOMCTL_soft_reset_cont 23 +#endif #define XEN_DOMCTL_settimeoffset 24 #define XEN_DOMCTL_getvcpuaffinity 25 #define XEN_DOMCTL_real_mode_area 26 /* Obsolete PPC only */ --- a/xen/include/public/event_channel.h +++ b/xen/include/public/event_channel.h @@ -74,6 +74,9 @@ #define EVTCHNOP_init_control 11 #define EVTCHNOP_expand_array 12 #define EVTCHNOP_set_priority 13 +#ifdef __XEN__ +#define EVTCHNOP_reset_cont 14 +#endif /* ` } */ typedef uint32_t evtchn_port_t; --- a/xen/include/xen/event.h +++ b/xen/include/xen/event.h @@ -179,7 +179,7 @@ void evtchn_check_pollers(struct domain void evtchn_2l_init(struct domain *d); /* Close all event channels and reset to 2-level ABI. */ -int evtchn_reset(struct domain *d); +int evtchn_reset(struct domain *d, bool resuming); /* * Low-level event channel port ops. --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -371,6 +371,8 @@ struct domain * EVTCHNOP_reset). Read/write access like for active_evtchns. */ unsigned int xen_evtchns; + /* Port to resume from in evtchn_reset(), when in a continuation. */ + unsigned int next_evtchn; spinlock_t event_lock; const struct evtchn_port_ops *evtchn_port_ops; struct evtchn_fifo_domain *evtchn_fifo; @@ -663,7 +665,7 @@ int domain_kill(struct domain *d); int domain_shutdown(struct domain *d, u8 reason); void domain_resume(struct domain *d); -int domain_soft_reset(struct domain *d); +int domain_soft_reset(struct domain *d, bool resuming); int vcpu_start_shutdown_deferral(struct vcpu *v); void vcpu_end_shutdown_deferral(struct vcpu *v); ++++++ 5f6cfb5b-x86-PV-dont-GP-for-SYSENTER-with-NT-set.patch ++++++ # Commit 61d4a04349895edc5a5868274b906ba61ef24f47 # Date 2020-09-24 21:02:35 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/pv: Don't deliver #GP for a SYSENTER with NT set It is a matter of guest kernel policy what to do with offending userspace, and terminating said userspace may not be the action chosen. Linux explicitly tolerates this case. 
Reported-by: Andy Lutomirski <luto@kernel.org> Fixes: fdac951560 ("x86: clear EFLAGS.NT in SYSENTER entry path") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -313,7 +313,6 @@ UNLIKELY_START(nz, sysenter_nt_set) pushfq andl $~X86_EFLAGS_NT,(%rsp) popfq - xorl %eax,%eax UNLIKELY_END(sysenter_nt_set) testq %rax,%rax leal (,%rcx,TBF_INTERRUPT),%ecx ++++++ 5f6cfb5b-x86-PV-dont-clobber-NT-on-return-to-guest.patch ++++++ # Commit 5bcac985498ed83d89666959175ca9c9ed561ae1 # Date 2020-09-24 21:02:35 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/pv: Don't clobber NT on return-to-guest A 64bit IRET can restore NT - the faulting case is when NT is set in the live flags. This change had an unintended consequence of causing the NT flag to spontaneously disappear from guest context whenever a interrupt/exception occurred. In combination with a SYSENTER which sets both TF and NT, Xen's handling of the #DB exceptions clears NT before it is even recorded suitably in the guest kernel's view of what userspace was doing. Reported-by: Andy Lutomirski <luto@kernel.org> Fixes: 0e47f92b0 ("x86: force EFLAGS.IF on when exiting to PV guests") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/x86_64/compat/entry.S +++ b/xen/arch/x86/x86_64/compat/entry.S @@ -119,7 +119,7 @@ compat_process_trap: /* %rbx: struct vcpu, interrupts disabled */ ENTRY(compat_restore_all_guest) ASSERT_INTERRUPTS_DISABLED - mov $~(X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_VM),%r11d + mov $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), %r11d and UREGS_eflags(%rsp),%r11d .macro alt_cr4_pv32 --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -182,7 +182,7 @@ restore_all_guest: jz iret_exit_to_guest movq 24(%rsp),%r11 # RFLAGS - andq $~(X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_VM),%r11 + andq $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), %r11 orq $X86_EFLAGS_IF,%r11 /* Don't use SYSRET path if the return address is not canonical. */ @@ -213,7 +213,7 @@ restore_all_guest: movq 8(%rsp), %rcx # RIP /* No special register assumptions. */ iret_exit_to_guest: - andl $~(X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_VM),24(%rsp) + andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), 24(%rsp) orl $X86_EFLAGS_IF,24(%rsp) addq $8,%rsp .Lft0: iretq ++++++ 5f71a21e-x86-S3-fix-shadow-stack-resume.patch ++++++ # Commit 4bdbf746ac9152e70f264f87db4472707da805ce # Date 2020-09-28 10:43:10 +0200 # Author Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> # Committer Jan Beulich <jbeulich@suse.com> x86/S3: fix shadow stack resume path Fix the resume path to load the shadow stack pointer from saved_ssp (not saved_rsp), to match what suspend path does. Fixes: 633ecc4a7cb2 ("x86/S3: Save and restore Shadow Stack configuration") Backport: 4.14 Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/acpi/wakeup_prot.S +++ b/xen/arch/x86/acpi/wakeup_prot.S @@ -69,7 +69,7 @@ ENTRY(s3_resume) * so SETSSBSY will successfully load a value useful for us, then * reset MSR_PL0_SSP to its usual value and pop the temporary token. 
*/ - mov saved_rsp(%rip), %rdi + mov saved_ssp(%rip), %rdi cmpq $1, %rdi je .L_shstk_done ++++++ 5f76ca65-evtchn-Flask-prealloc-for-send.patch ++++++ # Commit 52e1fc47abc3a0123d2b5bb7e9172e84fd571851 # Date 2020-10-02 08:36:21 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> evtchn/Flask: pre-allocate node on send path xmalloc() & Co may not be called with IRQs off, or else check_lock() will have its assertion trigger about locks getting acquired inconsistently. Re-arranging the locking in evtchn_send() doesn't seem very reasonable, especially since the per-channel lock was introduced to avoid acquiring the per-domain event lock on the send paths. Issue a second call to xsm_evtchn_send() instead, before acquiring the lock, to give XSM / Flask a chance to pre-allocate whatever it may need. As these nodes are used merely for caching earlier decisions' results, allocate just one node in AVC code despite two potentially being needed. Things will merely be not as performant if a second allocation was wanted, just like when the pre-allocation fails. Fixes: c0ddc8634845 ("evtchn: convert per-channel lock to be IRQ-safe") Signed-off-by: Jan Beulich <jbeulich@suse.com> Tested-by: Jason Andryuk <jandryuk@gmail.com> Acked-by: Julien Grall <jgrall@amazon.com> Reviewed-by: Jason Andryuk <jandryuk@gmail.com> --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c @@ -723,6 +723,12 @@ int evtchn_send(struct domain *ld, unsig if ( !port_is_valid(ld, lport) ) return -EINVAL; + /* + * As the call further down needs to avoid allocations (due to running + * with IRQs off), give XSM a chance to pre-allocate if needed. + */ + xsm_evtchn_send(XSM_HOOK, ld, NULL); + lchn = evtchn_from_port(ld, lport); spin_lock_irqsave(&lchn->lock, flags); --- a/xen/include/xsm/xsm.h +++ b/xen/include/xsm/xsm.h @@ -59,6 +59,7 @@ struct xsm_operations { int (*evtchn_interdomain) (struct domain *d1, struct evtchn *chn1, struct domain *d2, struct evtchn *chn2); void (*evtchn_close_post) (struct evtchn *chn); + /* Note: Next hook may be called with 'chn' set to NULL. See call site. */ int (*evtchn_send) (struct domain *d, struct evtchn *chn); int (*evtchn_status) (struct domain *d, struct evtchn *chn); int (*evtchn_reset) (struct domain *d1, struct domain *d2); --- a/xen/xsm/flask/avc.c +++ b/xen/xsm/flask/avc.c @@ -24,7 +24,9 @@ #include <xen/prefetch.h> #include <xen/kernel.h> #include <xen/sched.h> +#include <xen/cpu.h> #include <xen/init.h> +#include <xen/percpu.h> #include <xen/rcupdate.h> #include <asm/atomic.h> #include <asm/current.h> @@ -341,17 +343,79 @@ static inline int avc_reclaim_node(void) return ecx; } +static struct avc_node *new_node(void) +{ + struct avc_node *node = xzalloc(struct avc_node); + + if ( node ) + { + INIT_RCU_HEAD(&node->rhead); + INIT_HLIST_NODE(&node->list); + avc_cache_stats_incr(allocations); + } + + return node; +} + +/* + * avc_has_perm_noaudit() may consume up to two nodes, which we may not be + * able to obtain from the allocator at that point. Since the is merely + * about caching earlier decisions, allow for (just) one pre-allocated node. 
+ */ +static DEFINE_PER_CPU(struct avc_node *, prealloc_node); + +void avc_prealloc(void) +{ + struct avc_node **prealloc = &this_cpu(prealloc_node); + + if ( !*prealloc ) + *prealloc = new_node(); +} + +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct avc_node **prealloc = &per_cpu(prealloc_node, cpu); + + if ( action == CPU_DEAD && *prealloc ) + { + xfree(*prealloc); + *prealloc = NULL; + avc_cache_stats_incr(frees); + } + + return NOTIFY_DONE; +} + +static struct notifier_block cpu_nfb = { + .notifier_call = cpu_callback, + .priority = 99 +}; + +static int __init cpu_nfb_init(void) +{ + register_cpu_notifier(&cpu_nfb); + return 0; +} +__initcall(cpu_nfb_init); + static struct avc_node *avc_alloc_node(void) { - struct avc_node *node; + struct avc_node *node, **prealloc = &this_cpu(prealloc_node); - node = xzalloc(struct avc_node); - if (!node) - goto out; - - INIT_RCU_HEAD(&node->rhead); - INIT_HLIST_NODE(&node->list); - avc_cache_stats_incr(allocations); + node = *prealloc; + *prealloc = NULL; + + if ( !node ) + { + /* Must not call xmalloc() & Co with IRQs off. */ + if ( !local_irq_is_enabled() ) + goto out; + node = new_node(); + if ( !node ) + goto out; + } atomic_inc(&avc_cache.active_nodes); if ( atomic_read(&avc_cache.active_nodes) > avc_cache_threshold ) --- a/xen/xsm/flask/hooks.c +++ b/xen/xsm/flask/hooks.c @@ -281,6 +281,16 @@ static int flask_evtchn_send(struct doma { int rc; + /* + * When called with non-NULL chn, memory allocation may not be permitted. + * Allow AVC to preallocate nodes as necessary upon early notification. + */ + if ( !chn ) + { + avc_prealloc(); + return 0; + } + switch ( chn->state ) { case ECS_INTERDOMAIN: --- a/xen/xsm/flask/include/avc.h +++ b/xen/xsm/flask/include/avc.h @@ -91,6 +91,8 @@ int avc_has_perm_noaudit(u32 ssid, u32 t int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct avc_audit_data *auditdata); +void avc_prealloc(void); + /* Exported to selinuxfs */ struct xen_flask_hash_stats; int avc_get_hash_stats(struct xen_flask_hash_stats *arg); ++++++ 5f76caaf-evtchn-FIFO-use-stable-fields.patch ++++++ # Commit 6f6f07b64cbe90e54f8e62b4d6f2404cf5306536 # Date 2020-10-02 08:37:35 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> evtchn/fifo: use stable fields when recording "last queue" information Both evtchn->priority and evtchn->notify_vcpu_id could change behind the back of evtchn_fifo_set_pending(), as for it - in the case of interdomain channels - only the remote side's per-channel lock is held. Neither the queue's priority nor the vCPU's vcpu_id fields have similar properties, so they seem better suited for the purpose. In particular they reflect the respective evtchn fields' values at the time they were used to determine queue and vCPU. Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Reviewed-by: Paul Durrant <paul@xen.org> --- a/xen/common/event_fifo.c +++ b/xen/common/event_fifo.c @@ -225,8 +225,8 @@ static void evtchn_fifo_set_pending(stru /* Moved to a different queue? 
*/ if ( old_q != q ) { - evtchn->last_vcpu_id = evtchn->notify_vcpu_id; - evtchn->last_priority = evtchn->priority; + evtchn->last_vcpu_id = v->vcpu_id; + evtchn->last_priority = q->priority; spin_unlock_irqrestore(&old_q->lock, flags); spin_lock_irqsave(&q->lock, flags); ++++++ 5f897c25-x86-traps-fix-read_registers-for-DF.patch ++++++ # Commit 6065a05adf152a556fb9f11a5218c89e41b62893 # Date 2020-10-16 11:55:33 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/traps: 'Fix' safety of read_registers() in #DF path All interrupts and exceptions pass a struct cpu_user_regs up into C. This contains the legacy vm86 fields from 32bit days, which are beyond the hardware-pushed frame. Accessing these fields is generally illegal, as they are logically out of bounds for anything other than an interrupt/exception hitting ring1/3 code. show_registers() unconditionally reads these fields, but the content is discarded before use. This is benign right now, as all parts of the stack are readable, including the guard pages. However, read_registers() in the #DF handler writes to these fields as part of preparing the state dump, and being IST, hits the adjacent stack frame. This has been broken forever, but c/s 6001660473 "x86/shstk: Rework the stack layout to support shadow stacks" repositioned the #DF stack to be adjacent to the guard page, which turns this OoB write into a fatal pagefault: (XEN) *** DOUBLE FAULT *** (XEN) ----[ Xen-4.15-unstable x86_64 debug=y Tainted: C ]---- (XEN) ----[ Xen-4.15-unstable x86_64 debug=y Tainted: C ]---- (XEN) CPU: 4 (XEN) RIP: e008:[<ffff82d04031fd4f>] traps.c#read_registers+0x29/0xc1 (XEN) RFLAGS: 0000000000050086 CONTEXT: hypervisor (d1v0) ... (XEN) Xen call trace: (XEN) [<ffff82d04031fd4f>] R traps.c#read_registers+0x29/0xc1 (XEN) [<ffff82d0403207b3>] F do_double_fault+0x3d/0x7e (XEN) [<ffff82d04039acd7>] F double_fault+0x107/0x110 (XEN) (XEN) Pagetable walk from ffff830236f6d008: (XEN) L4[0x106] = 80000000bfa9b063 ffffffffffffffff (XEN) L3[0x008] = 0000000236ffd063 ffffffffffffffff (XEN) L2[0x1b7] = 0000000236ffc063 ffffffffffffffff (XEN) L1[0x16d] = 8000000236f6d161 ffffffffffffffff (XEN) (XEN) **************************************** (XEN) Panic on CPU 4: (XEN) FATAL PAGE FAULT (XEN) [error_code=0003] (XEN) Faulting linear address: ffff830236f6d008 (XEN) **************************************** (XEN) and rendering the main #DF analysis broken. The proper fix is to delete cpu_user_regs.es and later, so no interrupt/exception path can access OoB, but this needs disentangling from the PV ABI first. Not-really-fixes: 6001660473 ("x86/shstk: Rework the stack layout to support shadow stacks") Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/cpu/common.c +++ b/xen/arch/x86/cpu/common.c @@ -770,7 +770,13 @@ void load_system_tables(void) tss->ist[IST_MCE - 1] = stack_top + (1 + IST_MCE) * PAGE_SIZE; tss->ist[IST_NMI - 1] = stack_top + (1 + IST_NMI) * PAGE_SIZE; tss->ist[IST_DB - 1] = stack_top + (1 + IST_DB) * PAGE_SIZE; - tss->ist[IST_DF - 1] = stack_top + (1 + IST_DF) * PAGE_SIZE; + /* + * Gross bodge. The #DF handler uses the vm86 fields of cpu_user_regs + * beyond the hardware frame. Adjust the stack entrypoint so this + * doesn't manifest as an OoB write which hits the guard page. 
+ */ + tss->ist[IST_DF - 1] = stack_top + (1 + IST_DF) * PAGE_SIZE - + (sizeof(struct cpu_user_regs) - offsetof(struct cpu_user_regs, es)); tss->bitmap = IOBMP_INVALID_OFFSET; /* All other stack pointers poisioned. */ ++++++ 5f897c7b-x86-smpboot-restrict-memguard_guard_stack.patch ++++++ # Commit a7952a320c1e202a218702bfdb14f75132f04894 # Date 2020-10-16 11:56:59 +0100 # Author Andrew Cooper <andrew.cooper3@citrix.com> # Committer Andrew Cooper <andrew.cooper3@citrix.com> x86/smpboot: Don't unconditionally call memguard_guard_stack() in cpu_smpboot_alloc() cpu_smpboot_alloc() is designed to be idempotent with respect to partially initialised state. This occurs for S3 and CPU parking, where enough state to handle NMIs/#MCs needs to remain valid for the entire lifetime of Xen, even when we otherwise want to offline the CPU. For simplicity between various configuration, Xen always uses shadow stack mappings (Read-only + Dirty) for the guard page, irrespective of whether CET-SS is enabled. Unfortunately, the CET-SS changes in memguard_guard_stack() broke idempotency by first writing out the supervisor shadow stack tokens with plain writes, then changing the mapping to being read-only. This ordering is strictly necessary to configure the BSP, which cannot have the supervisor tokens be written with WRSS. Instead of calling memguard_guard_stack() unconditionally, call it only when actually allocating a new stack. Xenheap allocates are guaranteed to be writeable, and the net result is idempotency WRT configuring stack_base[]. Fixes: 91d26ed304f ("x86/shstk: Create shadow stacks") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/smpboot.c +++ b/xen/arch/x86/smpboot.c @@ -997,16 +997,18 @@ static int cpu_smpboot_alloc(unsigned in memflags = MEMF_node(node); if ( stack_base[cpu] == NULL ) + { stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags); - if ( stack_base[cpu] == NULL ) - goto out; + if ( !stack_base[cpu] ) + goto out; + + memguard_guard_stack(stack_base[cpu]); + } info = get_cpu_info_from_stack((unsigned long)stack_base[cpu]); info->processor_id = cpu; info->per_cpu_offset = __per_cpu_offset[cpu]; - memguard_guard_stack(stack_base[cpu]); - gdt = per_cpu(gdt, cpu) ?: alloc_xenheap_pages(0, memflags); if ( gdt == NULL ) goto out; ++++++ 5f8ed5d3-x86-mm-map_pages_to_xen-single-exit-path.patch ++++++ # Commit 08e6c6f80b018878476adc2c4e5679d2ce5cb4b1 # Date 2020-10-20 14:19:31 +0200 # Author Wei Liu <wei.liu2@citrix.com> # Committer Jan Beulich <jbeulich@suse.com> x86/mm: Refactor map_pages_to_xen to have only a single exit path We will soon need to perform clean-ups before returning. No functional change. This is part of XSA-345. 
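For illustration only (this is not Xen code; the function, names and allocations below are made up), the single-exit-path shape this refactor moves to looks like the following, with every failure funnelled through one label so that later patches can hang cleanup off it:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int map_example(size_t n)
{
    int rc = -ENOMEM;
    int *a = NULL, *b = NULL;

    a = malloc(n * sizeof(*a));
    if ( !a )
        goto out;                 /* previously: return -ENOMEM; */

    b = malloc(n * sizeof(*b));
    if ( !b )
        goto out;                 /* previously: return -ENOMEM; */

    /* ... the actual mapping work would go here ... */
    rc = 0;

 out:
    /*
     * Single exit point: a follow-up change can release a lock here and be
     * certain that every early return passes through it.
     */
    free(b);
    free(a);
    return rc;
}

int main(void)
{
    printf("map_example() returned %d\n", map_example(16));
    return 0;
}

Here the value is purely preparatory: the third XSA-345 patch below hangs the L3 table unlock off exactly this single exit path.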
Reported-by: Hongyan Xia <hongyxia@amazon.com> Signed-off-by: Wei Liu <wei.liu2@citrix.com> Signed-off-by: Hongyan Xia <hongyxia@amazon.com> Signed-off-by: George Dunlap <george.dunlap@citrix.com> Acked-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -5088,6 +5088,7 @@ int map_pages_to_xen( l2_pgentry_t *pl2e, ol2e; l1_pgentry_t *pl1e, ol1e; unsigned int i; + int rc = -ENOMEM; #define flush_flags(oldf) do { \ unsigned int o_ = (oldf); \ @@ -5108,7 +5109,8 @@ int map_pages_to_xen( l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt); if ( !pl3e ) - return -ENOMEM; + goto out; + ol3e = *pl3e; if ( cpu_has_page1gb && @@ -5198,7 +5200,7 @@ int map_pages_to_xen( l2t = alloc_xen_pagetable(); if ( l2t == NULL ) - return -ENOMEM; + goto out; for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) l2e_write(l2t + i, @@ -5227,7 +5229,7 @@ int map_pages_to_xen( pl2e = virt_to_xen_l2e(virt); if ( !pl2e ) - return -ENOMEM; + goto out; if ( ((((virt >> PAGE_SHIFT) | mfn_x(mfn)) & ((1u << PAGETABLE_ORDER) - 1)) == 0) && @@ -5271,7 +5273,7 @@ int map_pages_to_xen( { pl1e = virt_to_xen_l1e(virt); if ( pl1e == NULL ) - return -ENOMEM; + goto out; } else if ( l2e_get_flags(*pl2e) & _PAGE_PSE ) { @@ -5299,7 +5301,7 @@ int map_pages_to_xen( l1t = alloc_xen_pagetable(); if ( l1t == NULL ) - return -ENOMEM; + goto out; for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) l1e_write(&l1t[i], @@ -5445,7 +5447,10 @@ int map_pages_to_xen( #undef flush_flags - return 0; + rc = 0; + + out: + return rc; } int populate_pt_range(unsigned long virt, unsigned long nr_mfns) ++++++ 5f8ed5eb-x86-mm-modify_xen_mappings-one-exit-path.patch ++++++ # Commit b733f8a8b8db83f2d438cab3adb38b387cecfce0 # Date 2020-10-20 14:19:55 +0200 # Author Wei Liu <wei.liu2@citrix.com> # Committer Jan Beulich <jbeulich@suse.com> x86/mm: Refactor modify_xen_mappings to have one exit path We will soon need to perform clean-ups before returning. No functional change. This is part of XSA-345. Reported-by: Hongyan Xia <hongyxia@amazon.com> Signed-off-by: Wei Liu <wei.liu2@citrix.com> Signed-off-by: Hongyan Xia <hongyxia@amazon.com> Signed-off-by: George Dunlap <george.dunlap@citrix.com> Acked-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -5477,6 +5477,7 @@ int modify_xen_mappings(unsigned long s, l1_pgentry_t *pl1e; unsigned int i; unsigned long v = s; + int rc = -ENOMEM; /* Set of valid PTE bits which may be altered. */ #define FLAGS_MASK (_PAGE_NX|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_PRESENT) @@ -5520,7 +5521,8 @@ int modify_xen_mappings(unsigned long s, /* PAGE1GB: shatter the superpage and fall through. */ l2t = alloc_xen_pagetable(); if ( !l2t ) - return -ENOMEM; + goto out; + for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) l2e_write(l2t + i, l2e_from_pfn(l3e_get_pfn(*pl3e) + @@ -5577,7 +5579,8 @@ int modify_xen_mappings(unsigned long s, /* PSE: shatter the superpage and try again. 
*/ l1t = alloc_xen_pagetable(); if ( !l1t ) - return -ENOMEM; + goto out; + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) l1e_write(&l1t[i], l1e_from_pfn(l2e_get_pfn(*pl2e) + i, @@ -5710,7 +5713,10 @@ int modify_xen_mappings(unsigned long s, flush_area(NULL, FLUSH_TLB_GLOBAL); #undef FLAGS_MASK - return 0; + rc = 0; + + out: + return rc; } #undef flush_area ++++++ 5f8ed603-x86-mm-prevent-races-in-mapping-updates.patch ++++++ # Commit 1ce75e99d75907aaffae05fcf658a833802bce49 # Date 2020-10-20 14:20:19 +0200 # Author Hongyan Xia <hongyxia@amazon.com> # Committer Jan Beulich <jbeulich@suse.com> x86/mm: Prevent some races in hypervisor mapping updates map_pages_to_xen will attempt to coalesce mappings into 2MiB and 1GiB superpages if possible, to maximize TLB efficiency. This means both replacing superpage entries with smaller entries, and replacing smaller entries with superpages. Unfortunately, while some potential races are handled correctly, others are not. These include: 1. When one processor modifies a sub-superpage mapping while another processor replaces the entire range with a superpage. Take the following example: Suppose L3[N] points to L2. And suppose we have two processors, A and B. * A walks the pagetables, get a pointer to L2. * B replaces L3[N] with a 1GiB mapping. * B Frees L2 * A writes L2[M] # This is race exacerbated by the fact that virt_to_xen_l[21]e doesn't handle higher-level superpages properly: If you call virt_xen_to_l2e on a virtual address within an L3 superpage, you'll either hit a BUG() (most likely), or get a pointer into the middle of a data page; same with virt_xen_to_l1 on a virtual address within either an L3 or L2 superpage. So take the following example: * A reads pl3e and discovers it to point to an L2. * B replaces L3[N] with a 1GiB mapping * A calls virt_to_xen_l2e() and hits the BUG_ON() # 2. When two processors simultaneously try to replace a sub-superpage mapping with a superpage mapping. Take the following example: Suppose L3[N] points to L2. And suppose we have two processors, A and B, both trying to replace L3[N] with a superpage. * A walks the pagetables, get a pointer to pl3e, and takes a copy ol3e pointing to L2. * B walks the pagetables, gets a pointre to pl3e, and takes a copy ol3e pointing to L2. * A writes the new value into L3[N] * B writes the new value into L3[N] * A recursively frees all the L1's under L2, then frees L2 * B recursively double-frees all the L1's under L2, then double-frees L2 # Fix this by grabbing a lock for the entirety of the mapping update operation. Rather than grabbing map_pgdir_lock for the entire operation, however, repurpose the PGT_locked bit from L3's page->type_info as a lock. This means that rather than locking the entire address space, we "only" lock a single 512GiB chunk of hypervisor address space at a time. There was a proposal for a lock-and-reverify approach, where we walk the pagetables to the point where we decide what to do; then grab the map_pgdir_lock, re-verify the information we collected without the lock, and finally make the change (starting over again if anything had changed). Without being able to guarantee that the L2 table wasn't freed, however, that means every read would need to be considered potentially unsafe. Thinking carefully about that is probably something that wants to be done on public, not under time pressure. This is part of XSA-345. 
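As a rough stand-alone sketch of the locking primitive described above (not the actual Xen implementation: the real code operates on page->u.inuse.type_info with Xen's cmpxchg() and cpu_relax(), while the bit value and GCC builtins below are assumptions made for a user-space demo):

#include <assert.h>
#include <stdio.h>

#define PGT_locked (1UL << 0)        /* assumed bit position, demo only */

struct page_info {
    unsigned long type_info;         /* stands in for page->u.inuse.type_info */
};

static void l3t_lock(struct page_info *page)
{
    unsigned long x, nx;

    do {
        while ( (x = __atomic_load_n(&page->type_info, __ATOMIC_RELAXED)) &
                PGT_locked )
            ;                        /* cpu_relax() in the hypervisor */
        nx = x | PGT_locked;
    } while ( __sync_val_compare_and_swap(&page->type_info, x, nx) != x );
}

static void l3t_unlock(struct page_info *page)
{
    unsigned long x, nx, y = page->type_info;

    do {
        x = y;
        assert(x & PGT_locked);      /* BUG_ON() in the hypervisor */
        nx = x & ~PGT_locked;
    } while ( (y = __sync_val_compare_and_swap(&page->type_info, x, nx)) != x );
}

int main(void)
{
    struct page_info pg = { .type_info = 0x40 };

    l3t_lock(&pg);
    /* ... the L3 table backed by this page may be modified here ... */
    l3t_unlock(&pg);

    printf("type_info restored to %#lx\n", pg.type_info);
    return 0;
}

The real lock additionally documents ordering constraints against page_lock() and map_pgdir_lock, which this toy does not model.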
Reported-by: Hongyan Xia <hongyxia@amazon.com> Signed-off-by: Hongyan Xia <hongyxia@amazon.com> Signed-off-by: George Dunlap <george.dunlap@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -2088,6 +2088,50 @@ void page_unlock(struct page_info *page) current_locked_page_set(NULL); } +/* + * L3 table locks: + * + * Used for serialization in map_pages_to_xen() and modify_xen_mappings(). + * + * For Xen PT pages, the page->u.inuse.type_info is unused and it is safe to + * reuse the PGT_locked flag. This lock is taken only when we move down to L3 + * tables and below, since L4 (and above, for 5-level paging) is still globally + * protected by map_pgdir_lock. + * + * PV MMU update hypercalls call map_pages_to_xen while holding a page's page_lock(). + * This has two implications: + * - We cannot reuse reuse current_locked_page_* for debugging + * - To avoid the chance of deadlock, even for different pages, we + * must never grab page_lock() after grabbing l3t_lock(). This + * includes any page_lock()-based locks, such as + * mem_sharing_page_lock(). + * + * Also note that we grab the map_pgdir_lock while holding the + * l3t_lock(), so to avoid deadlock we must avoid grabbing them in + * reverse order. + */ +static void l3t_lock(struct page_info *page) +{ + unsigned long x, nx; + + do { + while ( (x = page->u.inuse.type_info) & PGT_locked ) + cpu_relax(); + nx = x | PGT_locked; + } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x ); +} + +static void l3t_unlock(struct page_info *page) +{ + unsigned long x, nx, y = page->u.inuse.type_info; + + do { + x = y; + BUG_ON(!(x & PGT_locked)); + nx = x & ~PGT_locked; + } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x ); +} + #ifdef CONFIG_PV /* * PTE flags that a guest may change without re-validating the PTE. @@ -5078,6 +5122,23 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned l flush_area_local((const void *)v, f) : \ flush_area_all((const void *)v, f)) +#define L3T_INIT(page) (page) = ZERO_BLOCK_PTR + +#define L3T_LOCK(page) \ + do { \ + if ( locking ) \ + l3t_lock(page); \ + } while ( false ) + +#define L3T_UNLOCK(page) \ + do { \ + if ( locking && (page) != ZERO_BLOCK_PTR ) \ + { \ + l3t_unlock(page); \ + (page) = ZERO_BLOCK_PTR; \ + } \ + } while ( false ) + int map_pages_to_xen( unsigned long virt, mfn_t mfn, @@ -5089,6 +5150,7 @@ int map_pages_to_xen( l1_pgentry_t *pl1e, ol1e; unsigned int i; int rc = -ENOMEM; + struct page_info *current_l3page; #define flush_flags(oldf) do { \ unsigned int o_ = (oldf); \ @@ -5104,13 +5166,20 @@ int map_pages_to_xen( } \ } while (0) + L3T_INIT(current_l3page); + while ( nr_mfns != 0 ) { - l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt); + l3_pgentry_t *pl3e, ol3e; + L3T_UNLOCK(current_l3page); + + pl3e = virt_to_xen_l3e(virt); if ( !pl3e ) goto out; + current_l3page = virt_to_page(pl3e); + L3T_LOCK(current_l3page); ol3e = *pl3e; if ( cpu_has_page1gb && @@ -5450,6 +5519,7 @@ int map_pages_to_xen( rc = 0; out: + L3T_UNLOCK(current_l3page); return rc; } @@ -5478,6 +5548,7 @@ int modify_xen_mappings(unsigned long s, unsigned int i; unsigned long v = s; int rc = -ENOMEM; + struct page_info *current_l3page; /* Set of valid PTE bits which may be altered. 
*/ #define FLAGS_MASK (_PAGE_NX|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_PRESENT) @@ -5486,11 +5557,22 @@ int modify_xen_mappings(unsigned long s, ASSERT(IS_ALIGNED(s, PAGE_SIZE)); ASSERT(IS_ALIGNED(e, PAGE_SIZE)); + L3T_INIT(current_l3page); + while ( v < e ) { - l3_pgentry_t *pl3e = virt_to_xen_l3e(v); + l3_pgentry_t *pl3e; - if ( !pl3e || !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) + L3T_UNLOCK(current_l3page); + + pl3e = virt_to_xen_l3e(v); + if ( !pl3e ) + goto out; + + current_l3page = virt_to_page(pl3e); + L3T_LOCK(current_l3page); + + if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) { /* Confirm the caller isn't trying to create new mappings. */ ASSERT(!(nf & _PAGE_PRESENT)); @@ -5716,9 +5798,13 @@ int modify_xen_mappings(unsigned long s, rc = 0; out: + L3T_UNLOCK(current_l3page); return rc; } +#undef L3T_LOCK +#undef L3T_UNLOCK + #undef flush_area int destroy_xen_mappings(unsigned long s, unsigned long e) ++++++ 5f8ed635-IOMMU-suppress-iommu_dont_flush_iotlb-when.patch ++++++ # Commit dea460d86957bf1425a8a1572626099ac3f165a8 # Date 2020-10-20 14:21:09 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> IOMMU: suppress "iommu_dont_flush_iotlb" when about to free a page Deferring flushes to a single, wide range one - as is done when handling XENMAPSPACE_gmfn_range - is okay only as long as pages don't get freed ahead of the eventual flush. While the only function setting the flag (xenmem_add_to_physmap()) suggests by its name that it's only mapping new entries, in reality the way xenmem_add_to_physmap_one() works means an unmap would happen not only for the page being moved (but not freed) but, if the destination GFN is populated, also for the page being displaced from that GFN. Collapsing the two flushes for this GFN into just one (end even more so deferring it to a batched invocation) is not correct. This is part of XSA-346. Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ") Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Paul Durrant <paul@xen.org> Acked-by: Julien Grall <jgrall@amazon.com> --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -293,6 +293,7 @@ int guest_remove_page(struct domain *d, p2m_type_t p2mt; #endif mfn_t mfn; + bool *dont_flush_p, dont_flush; int rc; #ifdef CONFIG_X86 @@ -379,8 +380,18 @@ int guest_remove_page(struct domain *d, return -ENXIO; } + /* + * Since we're likely to free the page below, we need to suspend + * xenmem_add_to_physmap()'s suppressing of IOMMU TLB flushes. + */ + dont_flush_p = &this_cpu(iommu_dont_flush_iotlb); + dont_flush = *dont_flush_p; + *dont_flush_p = false; + rc = guest_physmap_remove_page(d, _gfn(gmfn), mfn, 0); + *dont_flush_p = dont_flush; + /* * With the lack of an IOMMU on some platforms, domains with DMA-capable * device must retrieve the same pfn when the hypercall populate_physmap ++++++ 5f8ed64c-IOMMU-hold-page-ref-until-TLB-flush.patch ++++++ # Commit 5777a3742d88ff1c0ebc626ceb4fd47f9b3dc6d5 # Date 2020-10-20 14:21:32 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> IOMMU: hold page ref until after deferred TLB flush When moving around a page via XENMAPSPACE_gmfn_range, deferring the TLB flush for the "from" GFN range requires that the page remains allocated to the guest until the TLB flush has actually occurred. Otherwise a parallel hypercall to remove the page would only flush the TLB for the GFN it has been moved to, but not the one is was mapped at originally. 
This is part of XSA-346. Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ") Reported-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Julien Grall <jgrall@amazon.com> --- a/xen/arch/arm/mm.c +++ b/xen/arch/arm/mm.c @@ -1407,7 +1407,7 @@ void share_xen_page_with_guest(struct pa int xenmem_add_to_physmap_one( struct domain *d, unsigned int space, - union xen_add_to_physmap_batch_extra extra, + union add_to_physmap_extra extra, unsigned long idx, gfn_t gfn) { @@ -1480,10 +1480,6 @@ int xenmem_add_to_physmap_one( break; } case XENMAPSPACE_dev_mmio: - /* extra should be 0. Reserved for future use. */ - if ( extra.res0 ) - return -EOPNOTSUPP; - rc = map_dev_mmio_region(d, gfn, 1, _mfn(idx)); return rc; --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -4550,7 +4550,7 @@ static int handle_iomem_range(unsigned l int xenmem_add_to_physmap_one( struct domain *d, unsigned int space, - union xen_add_to_physmap_batch_extra extra, + union add_to_physmap_extra extra, unsigned long idx, gfn_t gpfn) { @@ -4634,9 +4634,20 @@ int xenmem_add_to_physmap_one( rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K); put_both: - /* In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. */ + /* + * In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. + * We also may need to transfer ownership of the page reference to our + * caller. + */ if ( space == XENMAPSPACE_gmfn ) + { put_gfn(d, gfn); + if ( !rc && extra.ppage ) + { + *extra.ppage = page; + page = NULL; + } + } if ( page ) put_page(page); --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -815,13 +815,12 @@ int xenmem_add_to_physmap(struct domain { unsigned int done = 0; long rc = 0; - union xen_add_to_physmap_batch_extra extra; + union add_to_physmap_extra extra = {}; + struct page_info *pages[16]; ASSERT(paging_mode_translate(d)); - if ( xatp->space != XENMAPSPACE_gmfn_foreign ) - extra.res0 = 0; - else + if ( xatp->space == XENMAPSPACE_gmfn_foreign ) extra.foreign_domid = DOMID_INVALID; if ( xatp->space != XENMAPSPACE_gmfn_range ) @@ -836,7 +835,10 @@ int xenmem_add_to_physmap(struct domain xatp->size -= start; if ( is_iommu_enabled(d) ) + { this_cpu(iommu_dont_flush_iotlb) = 1; + extra.ppage = &pages[0]; + } while ( xatp->size > done ) { @@ -848,8 +850,12 @@ int xenmem_add_to_physmap(struct domain xatp->idx++; xatp->gpfn++; + if ( extra.ppage ) + ++extra.ppage; + /* Check for continuation if it's not the last iteration. */ - if ( xatp->size > ++done && hypercall_preempt_check() ) + if ( (++done > ARRAY_SIZE(pages) && extra.ppage) || + (xatp->size > done && hypercall_preempt_check()) ) { rc = start + done; break; @@ -859,6 +865,7 @@ int xenmem_add_to_physmap(struct domain if ( is_iommu_enabled(d) ) { int ret; + unsigned int i; this_cpu(iommu_dont_flush_iotlb) = 0; @@ -867,6 +874,15 @@ int xenmem_add_to_physmap(struct domain if ( unlikely(ret) && rc >= 0 ) rc = ret; + /* + * Now that the IOMMU TLB flush was done for the original GFN, drop + * the page references. The 2nd flush below is fine to make later, as + * whoever removes the page again from its new GFN will have to do + * another flush anyway. 
+ */ + for ( i = 0; i < done; ++i ) + put_page(pages[i]); + ret = iommu_iotlb_flush(d, _dfn(xatp->gpfn - done), done, IOMMU_FLUSHF_added | IOMMU_FLUSHF_modified); if ( unlikely(ret) && rc >= 0 ) @@ -880,6 +896,8 @@ static int xenmem_add_to_physmap_batch(s struct xen_add_to_physmap_batch *xatpb, unsigned int extent) { + union add_to_physmap_extra extra = {}; + if ( unlikely(xatpb->size < extent) ) return -EILSEQ; @@ -891,6 +909,19 @@ static int xenmem_add_to_physmap_batch(s !guest_handle_subrange_okay(xatpb->errs, extent, xatpb->size - 1) ) return -EFAULT; + switch ( xatpb->space ) + { + case XENMAPSPACE_dev_mmio: + /* res0 is reserved for future use. */ + if ( xatpb->u.res0 ) + return -EOPNOTSUPP; + break; + + case XENMAPSPACE_gmfn_foreign: + extra.foreign_domid = xatpb->u.foreign_domid; + break; + } + while ( xatpb->size > extent ) { xen_ulong_t idx; @@ -903,8 +934,7 @@ static int xenmem_add_to_physmap_batch(s extent, 1)) ) return -EFAULT; - rc = xenmem_add_to_physmap_one(d, xatpb->space, - xatpb->u, + rc = xenmem_add_to_physmap_one(d, xatpb->space, extra, idx, _gfn(gpfn)); if ( unlikely(__copy_to_guest_offset(xatpb->errs, extent, &rc, 1)) ) --- a/xen/include/xen/mm.h +++ b/xen/include/xen/mm.h @@ -592,8 +592,22 @@ void scrub_one_page(struct page_info *); page_list_del(pg, page_to_list(d, pg)) #endif +union add_to_physmap_extra { + /* + * XENMAPSPACE_gmfn: When deferring TLB flushes, a page reference needs + * to be kept until after the flush, so the page can't get removed from + * the domain (and re-used for another purpose) beforehand. By passing + * non-NULL, the caller of xenmem_add_to_physmap_one() indicates it wants + * to have ownership of such a reference transferred in the success case. + */ + struct page_info **ppage; + + /* XENMAPSPACE_gmfn_foreign */ + domid_t foreign_domid; +}; + int xenmem_add_to_physmap_one(struct domain *d, unsigned int space, - union xen_add_to_physmap_batch_extra extra, + union add_to_physmap_extra extra, unsigned long idx, gfn_t gfn); int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp, ++++++ 5f8ed682-AMD-IOMMU-convert-amd_iommu_pte.patch ++++++ # Commit 73f62c7380edf07469581a3049aba98abd63b275 # Date 2020-10-20 14:22:26 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> AMD/IOMMU: convert amd_iommu_pte from struct to union This is to add a "raw" counterpart to the bitfield equivalent. Take the opportunity and - convert fields to bool / unsigned int, - drop the naming of the reserved field, - shorten the names of the ignored ones. This is part of XSA-347. 
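The point of adding the "raw" member is that a whole entry can then be read or written with a single access, which the follow-up patch relies on. A cut-down, stand-alone sketch (field names and widths are illustrative, not the real PTE layout; __atomic_store_n stands in here for Xen's write_atomic()):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Cut-down stand-in for the PTE; field widths are illustrative only. */
union example_pte {
    uint64_t raw;                    /* whole-entry view, for single accesses */
    struct {
        bool pr:1;
        unsigned int next_level:3;
        uint64_t mfn:40;
        bool ir:1;
        bool iw:1;
    };
};

int main(void)
{
    union example_pte pte = { .raw = 0 }, new = {};

    /* Build the new entry fully in a local variable ... */
    new.mfn = 0x12345;
    new.ir = new.iw = true;
    new.next_level = 0;
    new.pr = true;

    /* ... then publish it with one store of the raw view. */
    __atomic_store_n(&pte.raw, new.raw, __ATOMIC_RELAXED);

    printf("raw=%#llx pr=%d mfn=%#llx\n", (unsigned long long)pte.raw,
           pte.pr, (unsigned long long)(uint64_t)pte.mfn);
    return 0;
}

The next patch ("update live PTEs atomically") is what actually exploits the raw view: it constructs the new entry in a local union and writes it with one write_atomic(), instead of updating the live bitfields one by one.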
Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Paul Durrant <paul@xen.org> --- a/xen/drivers/passthrough/amd/iommu-defs.h +++ b/xen/drivers/passthrough/amd/iommu-defs.h @@ -451,20 +451,23 @@ union amd_iommu_x2apic_control { #define IOMMU_PAGE_TABLE_U32_PER_ENTRY (IOMMU_PAGE_TABLE_ENTRY_SIZE / 4) #define IOMMU_PAGE_TABLE_ALIGNMENT 4096 -struct amd_iommu_pte { - uint64_t pr:1; - uint64_t ignored0:4; - uint64_t a:1; - uint64_t d:1; - uint64_t ignored1:2; - uint64_t next_level:3; - uint64_t mfn:40; - uint64_t reserved:7; - uint64_t u:1; - uint64_t fc:1; - uint64_t ir:1; - uint64_t iw:1; - uint64_t ignored2:1; +union amd_iommu_pte { + uint64_t raw; + struct { + bool pr:1; + unsigned int ign0:4; + bool a:1; + bool d:1; + unsigned int ign1:2; + unsigned int next_level:3; + uint64_t mfn:40; + unsigned int :7; + bool u:1; + bool fc:1; + bool ir:1; + bool iw:1; + unsigned int ign2:1; + }; }; /* Paging modes */ --- a/xen/drivers/passthrough/amd/iommu_map.c +++ b/xen/drivers/passthrough/amd/iommu_map.c @@ -34,7 +34,7 @@ static unsigned int pfn_to_pde_idx(unsig static unsigned int clear_iommu_pte_present(unsigned long l1_mfn, unsigned long dfn) { - struct amd_iommu_pte *table, *pte; + union amd_iommu_pte *table, *pte; unsigned int flush_flags; table = map_domain_page(_mfn(l1_mfn)); @@ -48,7 +48,7 @@ static unsigned int clear_iommu_pte_pres return flush_flags; } -static unsigned int set_iommu_pde_present(struct amd_iommu_pte *pte, +static unsigned int set_iommu_pde_present(union amd_iommu_pte *pte, unsigned long next_mfn, unsigned int next_level, bool iw, bool ir) @@ -83,7 +83,7 @@ static unsigned int set_iommu_pte_presen int pde_level, bool iw, bool ir) { - struct amd_iommu_pte *table, *pde; + union amd_iommu_pte *table, *pde; unsigned int flush_flags; table = map_domain_page(_mfn(pt_mfn)); @@ -174,7 +174,7 @@ void iommu_dte_set_guest_cr3(struct amd_ static int iommu_pde_from_dfn(struct domain *d, unsigned long dfn, unsigned long pt_mfn[], bool map) { - struct amd_iommu_pte *pde, *next_table_vaddr; + union amd_iommu_pte *pde, *next_table_vaddr; unsigned long next_table_mfn; unsigned int level; struct page_info *table; @@ -448,7 +448,7 @@ int __init amd_iommu_quarantine_init(str unsigned long end_gfn = 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT); unsigned int level = amd_iommu_get_paging_mode(end_gfn); - struct amd_iommu_pte *table; + union amd_iommu_pte *table; if ( hd->arch.root_table ) { @@ -479,7 +479,7 @@ int __init amd_iommu_quarantine_init(str for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ ) { - struct amd_iommu_pte *pde = &table[i]; + union amd_iommu_pte *pde = &table[i]; /* * PDEs are essentially a subset of PTEs, so this function --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c @@ -387,7 +387,7 @@ static void deallocate_next_page_table(s static void deallocate_page_table(struct page_info *pg) { - struct amd_iommu_pte *table_vaddr; + union amd_iommu_pte *table_vaddr; unsigned int index, level = PFN_ORDER(pg); PFN_ORDER(pg) = 0; @@ -402,7 +402,7 @@ static void deallocate_page_table(struct for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ ) { - struct amd_iommu_pte *pde = &table_vaddr[index]; + union amd_iommu_pte *pde = &table_vaddr[index]; if ( pde->mfn && pde->next_level && pde->pr ) { @@ -554,7 +554,7 @@ static void amd_dump_p2m_table_level(str paddr_t gpa, int indent) { paddr_t address; - struct amd_iommu_pte *table_vaddr; + const union amd_iommu_pte *table_vaddr; int 
index; if ( level < 1 ) @@ -570,7 +570,7 @@ static void amd_dump_p2m_table_level(str for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ ) { - struct amd_iommu_pte *pde = &table_vaddr[index]; + const union amd_iommu_pte *pde = &table_vaddr[index]; if ( !(index % 2) ) process_pending_softirqs(); ++++++ 5f8ed69c-AMD-IOMMU-update-live-PTEs-atomically.patch ++++++ # Commit 3b055121c5410e2c3105d6d06aa24ca0d58868cd # Date 2020-10-20 14:22:52 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> AMD/IOMMU: update live PTEs atomically Updating a live PTE bitfield by bitfield risks the compiler re-ordering the individual updates as well as splitting individual updates into multiple memory writes. Construct the new entry fully in a local variable, do the check to determine the flushing needs on the thus established new entry, and then write the new entry by a single insn. Similarly using memset() to clear a PTE is unsafe, as the order of writes the function does is, at least in principle, undefined. This is part of XSA-347. Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Paul Durrant <paul@xen.org> --- a/xen/drivers/passthrough/amd/iommu_map.c +++ b/xen/drivers/passthrough/amd/iommu_map.c @@ -41,7 +41,7 @@ static unsigned int clear_iommu_pte_pres pte = &table[pfn_to_pde_idx(dfn, 1)]; flush_flags = pte->pr ? IOMMU_FLUSHF_modified : 0; - memset(pte, 0, sizeof(*pte)); + write_atomic(&pte->raw, 0); unmap_domain_page(table); @@ -53,26 +53,30 @@ static unsigned int set_iommu_pde_presen unsigned int next_level, bool iw, bool ir) { + union amd_iommu_pte new = {}, old; unsigned int flush_flags = IOMMU_FLUSHF_added; - if ( pte->pr && - (pte->mfn != next_mfn || - pte->iw != iw || - pte->ir != ir || - pte->next_level != next_level) ) - flush_flags |= IOMMU_FLUSHF_modified; - /* * FC bit should be enabled in PTE, this helps to solve potential * issues with ATS devices */ - pte->fc = !next_level; + new.fc = !next_level; + + new.mfn = next_mfn; + new.iw = iw; + new.ir = ir; + new.next_level = next_level; + new.pr = true; + + old.raw = read_atomic(&pte->raw); + old.ign0 = 0; + old.ign1 = 0; + old.ign2 = 0; + + if ( old.pr && old.raw != new.raw ) + flush_flags |= IOMMU_FLUSHF_modified; - pte->mfn = next_mfn; - pte->iw = iw; - pte->ir = ir; - pte->next_level = next_level; - pte->pr = 1; + write_atomic(&pte->raw, new.raw); return flush_flags; } ++++++ 5f8ed6b0-AMD-IOMMU-suitably-order-DTE-mods.patch ++++++ # Commit 0514a3a25fb9ebff5d75cc8f00a9229385300858 # Date 2020-10-20 14:23:12 +0200 # Author Jan Beulich <jbeulich@suse.com> # Committer Jan Beulich <jbeulich@suse.com> AMD/IOMMU: ensure suitable ordering of DTE modifications DMA and interrupt translation should be enabled only after other applicable DTE fields have been written. Similarly when disabling translation or when moving a device between domains, translation should first be disabled, before other entry fields get modified. Note however that the "moving" aspect doesn't apply to the interrupt remapping side, as domain specifics are maintained in the IRTEs here, not the DTE. We also never disable interrupt remapping once it got enabled for a device (the respective argument passed is always the immutable iommu_intremap). This is part of XSA-347. 
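The ordering requirement reduces to: disable a live entry before editing it, and make all other fields visible before (re)enabling it. A hedged, user-space approximation follows (the real DTE has many more fields and uses Xen's smp_wmb() rather than a C11 fence; the structure below is invented for the demo):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Cut-down device table entry; the real DTE has many more fields. */
struct example_dte {
    bool v;                          /* DMA translation enabled */
    uint16_t domain_id;
    uint64_t pt_root;
    uint8_t paging_mode;
};

/* Stand-in for Xen's smp_wmb(): order our stores for other observers. */
#define wmb() __atomic_thread_fence(__ATOMIC_RELEASE)

static void set_root_page_table(struct example_dte *dte, uint64_t root,
                                uint16_t domid, uint8_t mode, bool valid)
{
    /* A live entry is disabled first, before any other field changes. */
    if ( dte->v )
    {
        dte->v = false;
        wmb();
    }

    dte->domain_id = domid;
    dte->pt_root = root;
    dte->paging_mode = mode;
    wmb();                           /* fields visible before (re)enabling */

    dte->v = valid;
}

int main(void)
{
    struct example_dte dte = { .v = true };

    set_root_page_table(&dte, 0xabcd000, 1, 3, true);
    printf("v=%d dom=%u root=%#llx mode=%d\n", dte.v,
           (unsigned int)dte.domain_id, (unsigned long long)dte.pt_root,
           dte.paging_mode);
    return 0;
}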
Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Paul Durrant <paul@xen.org> --- a/xen/drivers/passthrough/amd/iommu_map.c +++ b/xen/drivers/passthrough/amd/iommu_map.c @@ -103,11 +103,18 @@ void amd_iommu_set_root_page_table(struc uint64_t root_ptr, uint16_t domain_id, uint8_t paging_mode, bool valid) { + if ( valid || dte->v ) + { + dte->tv = false; + dte->v = true; + smp_wmb(); + } dte->domain_id = domain_id; dte->pt_root = paddr_to_pfn(root_ptr); dte->iw = true; dte->ir = true; dte->paging_mode = paging_mode; + smp_wmb(); dte->tv = true; dte->v = valid; } @@ -130,6 +137,7 @@ void amd_iommu_set_intremap_table( } dte->ig = false; /* unmapped interrupts result in i/o page faults */ + smp_wmb(); dte->iv = valid; } --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c @@ -117,7 +117,10 @@ static void amd_iommu_setup_domain_devic /* Undo what amd_iommu_disable_domain_device() may have done. */ ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id]; if ( dte->it_root ) + { dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED; + smp_wmb(); + } dte->iv = iommu_intremap; dte->ex = ivrs_dev->dte_allow_exclusion; dte->sys_mgt = MASK_EXTR(ivrs_dev->device_flags, ACPI_IVHD_SYSTEM_MGMT); ++++++ libxc.sr.superpage.patch ++++++ --- /var/tmp/diff_new_pack.AogIHC/_old 2020-11-02 09:38:45.557515843 +0100 +++ /var/tmp/diff_new_pack.AogIHC/_new 2020-11-02 09:38:45.557515843 +0100 @@ -482,7 +482,7 @@ free(ctx->x86.restore.cpuid.ptr); free(ctx->x86.restore.msr.ptr); -@@ -249,6 +277,318 @@ static int x86_hvm_cleanup(struct xc_sr_ +@@ -249,6 +277,322 @@ static int x86_hvm_cleanup(struct xc_sr_ return 0; } @@ -601,8 +601,10 @@ + sp->count = 1ULL << order; + + /* Allocate only if there is room for another page */ -+ if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages ) ++ if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages ) { ++ errno = E2BIG; + return false; ++ } + + extent = sp->base_pfn = (sp->pfn >> order) << order; + done = xc_domain_populate_physmap(xch, ctx->domid, 1, order, 0, &extent); @@ -610,8 +612,10 @@ + PERROR("populate_physmap failed."); + return false; + } -+ if ( done == 0 ) ++ if ( done == 0 ) { ++ errno = ENOMEM; + return false; ++ } + + DPRINTF("4K base_pfn %" PRI_xen_pfn "\n", sp->base_pfn); + return true; ++++++ xsa286-1.patch ++++++ x86/mm: split L4 and L3 parts of the walk out of do_page_walk() The L3 one at least is going to be re-used by a subsequent patch, and splitting the L4 one then as well seems only natural. This is part of XSA-286. 
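As a loose analogy for the helper extraction being described (a toy two-level table, not Xen's real page-table types; the actual helpers additionally map and unmap domain pages, and the later xsa286 patches add locking and type checks on top):

#include <stdint.h>
#include <stdio.h>

#define ENTRIES  512
#define PRESENT  0x1u

typedef struct { uint64_t flags; uint64_t pfn; } l1e_t;
typedef struct { uint64_t flags; l1e_t *l1t; } l2e_t;   /* toy: pointer, not MFN */

static l2e_t root[ENTRIES];

/* Top-level helper: returns an "empty" entry on any failure, never NULL. */
static l2e_t walk_get_l2e(unsigned long addr)
{
    return root[(addr >> 21) % ENTRIES];
}

/* The next level reuses the helper above and again returns an empty entry. */
static l1e_t walk_get_l1e(unsigned long addr)
{
    l2e_t l2e = walk_get_l2e(addr);

    if ( !(l2e.flags & PRESENT) )
        return (l1e_t){ 0 };                 /* l1e_empty() in the real code */

    return l2e.l1t[(addr >> 12) % ENTRIES];
}

int main(void)
{
    static l1e_t l1t[ENTRIES];
    unsigned long addr = (3ul << 21) | (7ul << 12);
    l1e_t l1e;

    l1t[7] = (l1e_t){ .flags = PRESENT, .pfn = 0x1234 };
    root[3] = (l2e_t){ .flags = PRESENT, .l1t = l1t };

    l1e = walk_get_l1e(addr);
    printf("l1e: flags=%#llx pfn=%#llx\n",
           (unsigned long long)l1e.flags, (unsigned long long)l1e.pfn);
    return 0;
}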
Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: George Dunlap <george.dunlap@citrix.com> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -44,26 +44,47 @@ unsigned int __read_mostly m2p_compat_vs l2_pgentry_t *compat_idle_pg_table_l2; -void *do_page_walk(struct vcpu *v, unsigned long addr) +static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr) { - unsigned long mfn = pagetable_get_pfn(v->arch.guest_table); - l4_pgentry_t l4e, *l4t; - l3_pgentry_t l3e, *l3t; - l2_pgentry_t l2e, *l2t; - l1_pgentry_t l1e, *l1t; + unsigned long mfn = pagetable_get_pfn(root); + l4_pgentry_t *l4t, l4e; - if ( !is_pv_vcpu(v) || !is_canonical_address(addr) ) - return NULL; + if ( !is_canonical_address(addr) ) + return l4e_empty(); l4t = map_domain_page(_mfn(mfn)); l4e = l4t[l4_table_offset(addr)]; unmap_domain_page(l4t); + + return l4e; +} + +static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) +{ + l4_pgentry_t l4e = page_walk_get_l4e(root, addr); + l3_pgentry_t *l3t, l3e; + if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) - return NULL; + return l3e_empty(); l3t = map_l3t_from_l4e(l4e); l3e = l3t[l3_table_offset(addr)]; unmap_domain_page(l3t); + + return l3e; +} + +void *do_page_walk(struct vcpu *v, unsigned long addr) +{ + l3_pgentry_t l3e; + l2_pgentry_t l2e, *l2t; + l1_pgentry_t l1e, *l1t; + unsigned long mfn; + + if ( !is_pv_vcpu(v) ) + return NULL; + + l3e = page_walk_get_l3e(v->arch.guest_table, addr); mfn = l3e_get_pfn(l3e); if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) return NULL; ++++++ xsa286-2.patch ++++++ x86/mm: check page types in do_page_walk() For page table entries read to be guaranteed valid, transiently locking the pages and validating their types is necessary. Note that guest use of linear page tables is intentionally not taken into account here, as ordinary data (guest stacks) can't possibly live inside page tables. This is part of XSA-286. Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: George Dunlap <george.dunlap@citrix.com> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -46,15 +46,29 @@ l2_pgentry_t *compat_idle_pg_table_l2; static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr) { - unsigned long mfn = pagetable_get_pfn(root); - l4_pgentry_t *l4t, l4e; + mfn_t mfn = pagetable_get_mfn(root); + /* current's root page table can't disappear under our feet. 
*/ + bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table)); + struct page_info *pg; + l4_pgentry_t l4e = l4e_empty(); if ( !is_canonical_address(addr) ) return l4e_empty(); - l4t = map_domain_page(_mfn(mfn)); - l4e = l4t[l4_table_offset(addr)]; - unmap_domain_page(l4t); + pg = mfn_to_page(mfn); + if ( need_lock && !page_lock(pg) ) + return l4e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table ) + { + l4_pgentry_t *l4t = map_domain_page(mfn); + + l4e = l4t[l4_table_offset(addr)]; + unmap_domain_page(l4t); + } + + if ( need_lock ) + page_unlock(pg); return l4e; } @@ -62,14 +76,26 @@ static l4_pgentry_t page_walk_get_l4e(pa static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) { l4_pgentry_t l4e = page_walk_get_l4e(root, addr); - l3_pgentry_t *l3t, l3e; + mfn_t mfn = l4e_get_mfn(l4e); + struct page_info *pg; + l3_pgentry_t l3e = l3e_empty(); if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) return l3e_empty(); - l3t = map_l3t_from_l4e(l4e); - l3e = l3t[l3_table_offset(addr)]; - unmap_domain_page(l3t); + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return l3e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table ) + { + l3_pgentry_t *l3t = map_domain_page(mfn); + + l3e = l3t[l3_table_offset(addr)]; + unmap_domain_page(l3t); + } + + page_unlock(pg); return l3e; } @@ -77,44 +103,67 @@ static l3_pgentry_t page_walk_get_l3e(pa void *do_page_walk(struct vcpu *v, unsigned long addr) { l3_pgentry_t l3e; - l2_pgentry_t l2e, *l2t; - l1_pgentry_t l1e, *l1t; - unsigned long mfn; + l2_pgentry_t l2e = l2e_empty(); + l1_pgentry_t l1e = l1e_empty(); + mfn_t mfn; + struct page_info *pg; if ( !is_pv_vcpu(v) ) return NULL; l3e = page_walk_get_l3e(v->arch.guest_table, addr); - mfn = l3e_get_pfn(l3e); - if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) + mfn = l3e_get_mfn(l3e); + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return NULL; if ( (l3e_get_flags(l3e) & _PAGE_PSE) ) { - mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)); + mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1))); goto ret; } - l2t = map_domain_page(_mfn(mfn)); - l2e = l2t[l2_table_offset(addr)]; - unmap_domain_page(l2t); - mfn = l2e_get_pfn(l2e); - if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return NULL; + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table ) + { + const l2_pgentry_t *l2t = map_domain_page(mfn); + + l2e = l2t[l2_table_offset(addr)]; + unmap_domain_page(l2t); + } + + page_unlock(pg); + + mfn = l2e_get_mfn(l2e); + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return NULL; if ( (l2e_get_flags(l2e) & _PAGE_PSE) ) { - mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)); + mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1))); goto ret; } - l1t = map_domain_page(_mfn(mfn)); - l1e = l1t[l1_table_offset(addr)]; - unmap_domain_page(l1t); - mfn = l1e_get_pfn(l1e); - if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return NULL; + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table ) + { + const l1_pgentry_t *l1t = map_domain_page(mfn); + + l1e = l1t[l1_table_offset(addr)]; + unmap_domain_page(l1t); + } + + page_unlock(pg); + + mfn = l1e_get_mfn(l1e); + if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return NULL; ret: - return 
map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK); + return map_domain_page(mfn) + (addr & ~PAGE_MASK); } /* ++++++ xsa286-3.patch ++++++ x86/mm: avoid using linear page tables in map_guest_l1e() Replace the linear L2 table access by an actual page walk. This is part of XSA-286. Signed-off-by: Jan Beulich <jbeulich@suse.com> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> Reviewed-by: George Dunlap <george.dunlap@citrix.com> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/arch/x86/pv/mm.c +++ b/xen/arch/x86/pv/mm.c @@ -40,11 +40,14 @@ l1_pgentry_t *map_guest_l1e(unsigned lon if ( unlikely(!__addr_ok(linear)) ) return NULL; - /* Find this l1e and its enclosing l1mfn in the linear map. */ - if ( __copy_from_user(&l2e, - &__linear_l2_table[l2_linear_offset(linear)], - sizeof(l2_pgentry_t)) ) + if ( unlikely(!(current->arch.flags & TF_kernel_mode)) ) + { + ASSERT_UNREACHABLE(); return NULL; + } + + /* Find this l1e and its enclosing l1mfn. */ + l2e = page_walk_get_l2e(current->arch.guest_table, linear); /* Check flags that it will be safe to read the l1e. */ if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT ) --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -100,6 +100,34 @@ static l3_pgentry_t page_walk_get_l3e(pa return l3e; } +l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr) +{ + l3_pgentry_t l3e = page_walk_get_l3e(root, addr); + mfn_t mfn = l3e_get_mfn(l3e); + struct page_info *pg; + l2_pgentry_t l2e = l2e_empty(); + + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || + (l3e_get_flags(l3e) & _PAGE_PSE) ) + return l2e_empty(); + + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return l2e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table ) + { + l2_pgentry_t *l2t = map_domain_page(mfn); + + l2e = l2t[l2_table_offset(addr)]; + unmap_domain_page(l2t); + } + + page_unlock(pg); + + return l2e; +} + void *do_page_walk(struct vcpu *v, unsigned long addr) { l3_pgentry_t l3e; --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -579,7 +579,9 @@ void audit_domains(void); void make_cr3(struct vcpu *v, mfn_t mfn); void update_cr3(struct vcpu *v); int vcpu_destroy_pagetables(struct vcpu *); + void *do_page_walk(struct vcpu *v, unsigned long addr); +l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr); /* Allocator functions for Xen pagetables. */ void *alloc_xen_pagetable(void); ++++++ xsa286-4.patch ++++++ x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e() First of all drop guest_get_eff_l1e() entirely - there's no actual user of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around its only call site. Then replace the linear L1 table access by an actual page walk. This is part of XSA-286. Signed-off-by: Jan Beulich <jbeulich@suse.com> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> Reviewed-by: George Dunlap <george.dunlap@citrix.com> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/arch/x86/pv/mm.c +++ b/xen/arch/x86/pv/mm.c @@ -59,27 +59,6 @@ l1_pgentry_t *map_guest_l1e(unsigned lon } /* - * Read the guest's l1e that maps this address, from the kernel-mode - * page tables. 
- */ -static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear) -{ - struct vcpu *curr = current; - const bool user_mode = !(curr->arch.flags & TF_kernel_mode); - l1_pgentry_t l1e; - - if ( user_mode ) - toggle_guest_pt(curr); - - l1e = guest_get_eff_l1e(linear); - - if ( user_mode ) - toggle_guest_pt(curr); - - return l1e; -} - -/* * Map a guest's LDT page (covering the byte at @offset from start of the LDT) * into Xen's virtual range. Returns true if the mapping changed, false * otherwise. --- a/xen/arch/x86/pv/mm.h +++ b/xen/arch/x86/pv/mm.h @@ -5,19 +5,19 @@ l1_pgentry_t *map_guest_l1e(unsigned lon int new_guest_cr3(mfn_t mfn); -/* Read a PV guest's l1e that maps this linear address. */ -static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear) +/* + * Read the guest's l1e that maps this address, from the kernel-mode + * page tables. + */ +static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear) { - l1_pgentry_t l1e; + l1_pgentry_t l1e = l1e_empty(); ASSERT(!paging_mode_translate(current->domain)); ASSERT(!paging_mode_external(current->domain)); - if ( unlikely(!__addr_ok(linear)) || - __copy_from_user(&l1e, - &__linear_l1_table[l1_linear_offset(linear)], - sizeof(l1_pgentry_t)) ) - l1e = l1e_empty(); + if ( likely(__addr_ok(linear)) ) + l1e = page_walk_get_l1e(current->arch.guest_table, linear); return l1e; } --- a/xen/arch/x86/pv/ro-page-fault.c +++ b/xen/arch/x86/pv/ro-page-fault.c @@ -349,7 +349,7 @@ int pv_ro_page_fault(unsigned long addr, bool mmio_ro; /* Attempt to read the PTE that maps the VA being accessed. */ - pte = guest_get_eff_l1e(addr); + pte = guest_get_eff_kern_l1e(addr); /* We are only looking for read-only mappings */ if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) ) --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -128,6 +128,62 @@ l2_pgentry_t page_walk_get_l2e(pagetable return l2e; } +/* + * For now no "set_accessed" parameter, as all callers want it set to true. + * For now also no "set_dirty" parameter, as all callers deal with r/o + * mappings, and we don't want to set the dirty bit there (conflicts with + * CET-SS). However, as there are CPUs which may set the dirty bit on r/o + * PTEs, the logic below tolerates the bit becoming set "behind our backs". + */ +l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr) +{ + l2_pgentry_t l2e = page_walk_get_l2e(root, addr); + mfn_t mfn = l2e_get_mfn(l2e); + struct page_info *pg; + l1_pgentry_t l1e = l1e_empty(); + + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || + (l2e_get_flags(l2e) & _PAGE_PSE) ) + return l1e_empty(); + + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return l1e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table ) + { + l1_pgentry_t *l1t = map_domain_page(mfn); + + l1e = l1t[l1_table_offset(addr)]; + + if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) == + _PAGE_PRESENT ) + { + l1_pgentry_t ol1e = l1e; + + l1e_add_flags(l1e, _PAGE_ACCESSED); + /* + * Best effort only; with the lock held the page shouldn't + * change anyway, except for the dirty bit to perhaps become set. 
+ */ + while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]), + l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) != + l1e_get_intpte(ol1e) && + !(l1e_get_flags(l1e) & _PAGE_DIRTY) ) + { + l1e_add_flags(ol1e, _PAGE_DIRTY); + l1e_add_flags(l1e, _PAGE_DIRTY); + } + } + + unmap_domain_page(l1t); + } + + page_unlock(pg); + + return l1e; +} + void *do_page_walk(struct vcpu *v, unsigned long addr) { l3_pgentry_t l3e; --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -582,6 +582,7 @@ int vcpu_destroy_pagetables(struct vcpu void *do_page_walk(struct vcpu *v, unsigned long addr); l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr); +l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr); /* Allocator functions for Xen pagetables. */ void *alloc_xen_pagetable(void); ++++++ xsa286-5.patch ++++++ x86/mm: avoid using top level linear page tables in {,un}map_domain_page() Move the page table recursion two levels down. This entails avoiding to free the recursive mapping prematurely in free_perdomain_mappings(). This is part of XSA-286. Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: George Dunlap <george.dunlap@citrix.com> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/arch/x86/domain_page.c +++ b/xen/arch/x86/domain_page.c @@ -65,7 +65,8 @@ void __init mapcache_override_current(st #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER) #define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1) #define MAPCACHE_L1ENT(idx) \ - __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))] + ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \ + ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx] void *map_domain_page(mfn_t mfn) { @@ -235,6 +236,7 @@ int mapcache_domain_init(struct domain * { struct mapcache_domain *dcache = &d->arch.pv.mapcache; unsigned int bitmap_pages; + int rc; ASSERT(is_pv_domain(d)); @@ -243,8 +245,10 @@ int mapcache_domain_init(struct domain * return 0; #endif + BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1)); BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 + - 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) > + 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) + + (1U << L2_PAGETABLE_SHIFT) > MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20)); bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long)); dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE; @@ -253,9 +257,25 @@ int mapcache_domain_init(struct domain * spin_lock_init(&dcache->lock); - return create_perdomain_mapping(d, (unsigned long)dcache->inuse, - 2 * bitmap_pages + 1, - NIL(l1_pgentry_t *), NULL); + rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse, + 2 * bitmap_pages + 1, + NIL(l1_pgentry_t *), NULL); + if ( !rc ) + { + /* + * Install mapping of our L2 table into its own last slot, for easy + * access to the L1 entries via MAPCACHE_L1ENT(). 
+ */ + l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg); + l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)]; + l2_pgentry_t *l2t = map_l2t_from_l3e(l3e); + + l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e); + unmap_domain_page(l2t); + unmap_domain_page(l3t); + } + + return rc; } int mapcache_vcpu_init(struct vcpu *v) @@ -346,7 +366,7 @@ mfn_t domain_page_map_to_mfn(const void else { ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END); - pl1e = &__linear_l1_table[l1_linear_offset(va)]; + pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START)); } return l1e_get_mfn(*pl1e); --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -6061,6 +6061,10 @@ void free_perdomain_mappings(struct doma { struct page_info *l1pg = l2e_get_page(l2tab[j]); + /* mapcache_domain_init() installs a recursive entry. */ + if ( l1pg == l2pg ) + continue; + if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 ) { l1_pgentry_t *l1tab = __map_domain_page(l1pg); ++++++ xsa286-6.patch ++++++ x86/mm: restrict use of linear page tables to shadow mode code Other code does not require them to be set up anymore, so restrict when to populate the respective L4 slot and reduce visibility of the accessors. While with the removal of all uses the vulnerability is actually fixed, removing the creation of the linear mapping adds an extra layer of protection. Similarly reducing visibility of the accessors mostly eliminates the risk of undue re-introduction of uses of the linear mappings. This is (not strictly) part of XSA-286. Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: George Dunlap <george.dunlap@citrix.com> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -1682,9 +1682,10 @@ void init_xen_l4_slots(l4_pgentry_t *l4t l4t[l4_table_offset(PCI_MCFG_VIRT_START)] = idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)]; - /* Slot 258: Self linear mappings. */ + /* Slot 258: Self linear mappings (shadow pt only). */ ASSERT(!mfn_eq(l4mfn, INVALID_MFN)); l4t[l4_table_offset(LINEAR_PT_VIRT_START)] = + !shadow_mode_external(d) ? l4e_empty() : l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW); /* Slot 259: Shadow linear mappings (if applicable) .*/ --- a/xen/arch/x86/mm/shadow/private.h +++ b/xen/arch/x86/mm/shadow/private.h @@ -139,6 +139,15 @@ enum { # define GUEST_PTE_SIZE 4 #endif +/* Where to find each level of the linear mapping */ +#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) +#define __linear_l2_table \ + ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START))) +#define __linear_l3_table \ + ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START))) +#define __linear_l4_table \ + ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START))) + /****************************************************************************** * Auditing routines */ --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -808,9 +808,6 @@ void __init paging_init(void) machine_to_phys_mapping_valid = 1; - /* Set up linear page table mapping. */ - l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)], - l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW)); return; nomem: --- a/xen/include/asm-x86/config.h +++ b/xen/include/asm-x86/config.h @@ -199,7 +199,7 @@ extern unsigned char boot_edid_info[128] */ #define PCI_MCFG_VIRT_START (PML4_ADDR(257)) #define PCI_MCFG_VIRT_END (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES) -/* Slot 258: linear page table (guest table). 
*/ +/* Slot 258: linear page table (monitor table, HVM only). */ #define LINEAR_PT_VIRT_START (PML4_ADDR(258)) #define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES) /* Slot 259: linear page table (shadow table). */ --- a/xen/include/asm-x86/page.h +++ b/xen/include/asm-x86/page.h @@ -294,19 +294,6 @@ void copy_page_sse2(void *, const void * #define vmap_to_mfn(va) _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va)))) #define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va)) -#endif /* !defined(__ASSEMBLY__) */ - -/* Where to find each level of the linear mapping */ -#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) -#define __linear_l2_table \ - ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START))) -#define __linear_l3_table \ - ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START))) -#define __linear_l4_table \ - ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START))) - - -#ifndef __ASSEMBLY__ extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES]; extern l2_pgentry_t *compat_idle_pg_table_l2; extern unsigned int m2p_compat_vstart;