# git rev-parse -q --verify 80d47f5de5e311cbc0d01ebb6ee684e8f4c196c6^{commit} 80d47f5de5e311cbc0d01ebb6ee684e8f4c196c6 already have revision, skipping fetch # git checkout -q -f -B kisskb 80d47f5de5e311cbc0d01ebb6ee684e8f4c196c6 # git clean -qxdf # < git log -1 # commit 80d47f5de5e311cbc0d01ebb6ee684e8f4c196c6 # Author: Linus Torvalds # Date: Thu Feb 17 08:57:47 2022 -0800 # # mm: don't try to NUMA-migrate COW pages that have other uses # # Oded Gabbay reports that enabling NUMA balancing causes corruption with # his Gaudi accelerator test load: # # "All the details are in the bug, but the bottom line is that somehow, # this patch causes corruption when the numa balancing feature is # enabled AND we don't use process affinity AND we use GUP to pin pages # so our accelerator can DMA to/from system memory. # # Either disabling numa balancing, using process affinity to bind to # specific numa-node or reverting this patch causes the bug to # disappear" # # and Oded bisected the issue to commit 09854ba94c6a ("mm: do_wp_page() # simplification"). # # Now, the NUMA balancing shouldn't actually be changing the writability # of a page, and as such shouldn't matter for COW. But it appears it # does. Suspicious. # # However, regardless of that, the condition for enabling NUMA faults in # change_pte_range() is nonsensical. It uses "page_mapcount(page)" to # decide if a COW page should be NUMA-protected or not, and that makes # absolutely no sense. # # The number of mappings a page has is irrelevant: not only does GUP get a # reference to a page as in Oded's case, but the other mappings migth be # paged out and the only reference to them would be in the page count. # # Since we should never try to NUMA-balance a page that we can't move # anyway due to other references, just fix the code to use 'page_count()'. # Oded confirms that that fixes his issue. # # Now, this does imply that something in NUMA balancing ends up changing # page protections (other than the obvious one of making the page # inaccessible to get the NUMA faulting information). Otherwise the COW # simplification wouldn't matter - since doing the GUP on the page would # make sure it's writable. # # The cause of that permission change would be good to figure out too, # since it clearly results in spurious COW events - but fixing the # nonsensical test that just happened to work before is obviously the # CorrectThing(tm) to do regardless. # # Fixes: 09854ba94c6a ("mm: do_wp_page() simplification") # Link: https://bugzilla.kernel.org/show_bug.cgi?id=215616 # Link: https://lore.kernel.org/all/CAFCwf10eNmwq2wD71xjUhqkvv5+_pJMR1nPug2RqNDcFT4H86Q@mail.gmail.com/ # Reported-and-tested-by: Oded Gabbay # Cc: David Hildenbrand # Cc: Peter Xu # Signed-off-by: Linus Torvalds # < /opt/cross/kisskb/fe-x86-64-core-i7-2017.05/bin/x86_64-linux-gcc --version # < /opt/cross/kisskb/fe-x86-64-core-i7-2017.05/bin/x86_64-linux-ld --version # < git log --format=%s --max-count=1 80d47f5de5e311cbc0d01ebb6ee684e8f4c196c6 # < make -s -j 120 ARCH=um O=/kisskb/build/linus_um-allyesconfig_um-x86_64 CROSS_COMPILE=/opt/cross/kisskb/fe-x86-64-core-i7-2017.05/bin/x86_64-linux- SUBARCH=x86_64 allyesconfig # Added to kconfig CONFIG_STANDALONE=y # Added to kconfig CONFIG_KCOV=n # Added to kconfig CONFIG_GCC_PLUGINS=n # Added to kconfig CONFIG_GCC_PLUGIN_CYC_COMPLEXITY=n # Added to kconfig CONFIG_GCC_PLUGIN_SANCOV=n # Added to kconfig CONFIG_GCC_PLUGIN_LATENT_ENTROPY=n # Added to kconfig CONFIG_GCC_PLUGIN_STRUCTLEAK=n # Added to kconfig CONFIG_GCC_PLUGIN_RANDSTRUCT=n # Added to kconfig CONFIG_UML_NET=n # Added to kconfig CONFIG_UML_NET_ETHERTAP=n # Added to kconfig CONFIG_UML_NET_TUNTAP=n # Added to kconfig CONFIG_UML_NET_SLIP=n # Added to kconfig CONFIG_UML_NET_DAEMON=n # Added to kconfig CONFIG_UML_NET_VDE=n # Added to kconfig CONFIG_UML_NET_MCAST=n # Added to kconfig CONFIG_UML_NET_PCAP=n # Added to kconfig CONFIG_UML_NET_SLIRP=n # Added to kconfig CONFIG_GCOV_KERNEL=n # Added to kconfig CONFIG_DEBUG_INFO_BTF=n # Added to kconfig CONFIG_BPF_PRELOAD=n # < make -s -j 120 ARCH=um O=/kisskb/build/linus_um-allyesconfig_um-x86_64 CROSS_COMPILE=/opt/cross/kisskb/fe-x86-64-core-i7-2017.05/bin/x86_64-linux- SUBARCH=x86_64 help # make -s -j 120 ARCH=um O=/kisskb/build/linus_um-allyesconfig_um-x86_64 CROSS_COMPILE=/opt/cross/kisskb/fe-x86-64-core-i7-2017.05/bin/x86_64-linux- SUBARCH=x86_64 olddefconfig .config:12230:warning: override: reassigning to symbol GCC_PLUGIN_SANCOV .config:12233:warning: override: reassigning to symbol GCC_PLUGIN_RANDSTRUCT .config:12235:warning: override: reassigning to symbol UML_NET_ETHERTAP .config:12237:warning: override: reassigning to symbol UML_NET_SLIP .config:12240:warning: override: reassigning to symbol UML_NET_MCAST .config:12242:warning: override: reassigning to symbol UML_NET_SLIRP # make -s -j 120 ARCH=um O=/kisskb/build/linus_um-allyesconfig_um-x86_64 CROSS_COMPILE=/opt/cross/kisskb/fe-x86-64-core-i7-2017.05/bin/x86_64-linux- SUBARCH=x86_64 /kisskb/src/drivers/vfio/pci/vfio_pci_rdwr.c: In function 'vfio_pci_vga_rw': /kisskb/src/drivers/vfio/pci/vfio_pci_rdwr.c:317:11: error: implicit declaration of function 'ioport_map' [-Werror=implicit-function-declaration] iomem = ioport_map(0x3b0, 0x3bb - 0x3b0 + 1); ^ /kisskb/src/drivers/vfio/pci/vfio_pci_rdwr.c:317:9: error: assignment makes pointer from integer without a cast [-Werror=int-conversion] iomem = ioport_map(0x3b0, 0x3bb - 0x3b0 + 1); ^ /kisskb/src/drivers/vfio/pci/vfio_pci_rdwr.c:324:9: error: assignment makes pointer from integer without a cast [-Werror=int-conversion] iomem = ioport_map(0x3c0, 0x3df - 0x3c0 + 1); ^ /kisskb/src/drivers/vfio/pci/vfio_pci_rdwr.c:338:15: error: implicit declaration of function 'ioport_unmap' [-Werror=implicit-function-declaration] is_ioport ? ioport_unmap(iomem) : iounmap(iomem); ^ cc1: all warnings being treated as errors make[4]: *** [/kisskb/src/scripts/Makefile.build:288: drivers/vfio/pci/vfio_pci_rdwr.o] Error 1 make[4]: *** Waiting for unfinished jobs.... make[3]: *** [/kisskb/src/scripts/Makefile.build:550: drivers/vfio/pci] Error 2 make[2]: *** [/kisskb/src/scripts/Makefile.build:550: drivers/vfio] Error 2 make[2]: *** Waiting for unfinished jobs.... In file included from /kisskb/src/arch/x86/um/asm/processor.h:41:0, from /kisskb/src/include/linux/mutex.h:19, from /kisskb/src/include/linux/kernfs.h:11, from /kisskb/src/include/linux/sysfs.h:16, from /kisskb/src/include/linux/kobject.h:20, from /kisskb/src/include/linux/pci.h:35, from /kisskb/src/drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_topology.c:25: /kisskb/src/drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_topology.c: In function 'kfd_cpumask_to_apic_id': /kisskb/src/arch/um/include/asm/processor-generic.h:103:18: error: called object is not a function or function pointer #define cpu_data (&boot_cpu_data) ^ /kisskb/src/drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_topology.c:1556:9: note: in expansion of macro 'cpu_data' return cpu_data(first_cpu_of_numa_node).apicid; ^ /kisskb/src/drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_topology.c:1560:1: error: control reaches end of non-void function [-Werror=return-type] } ^ cc1: all warnings being treated as errors make[5]: *** [/kisskb/src/scripts/Makefile.build:288: drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_topology.o] Error 1 make[5]: *** Waiting for unfinished jobs.... make[4]: *** [/kisskb/src/scripts/Makefile.build:550: drivers/gpu/drm/amd/amdgpu] Error 2 make[4]: *** Waiting for unfinished jobs.... make[3]: *** [/kisskb/src/scripts/Makefile.build:550: drivers/gpu/drm] Error 2 make[2]: *** [/kisskb/src/scripts/Makefile.build:550: drivers/gpu] Error 2 make[1]: *** [/kisskb/src/Makefile:1831: drivers] Error 2 make: *** [Makefile:219: __sub-make] Error 2 Command 'make -s -j 120 ARCH=um O=/kisskb/build/linus_um-allyesconfig_um-x86_64 CROSS_COMPILE=/opt/cross/kisskb/fe-x86-64-core-i7-2017.05/bin/x86_64-linux- SUBARCH=x86_64 ' returned non-zero exit status 2 # rm -rf /kisskb/build/linus_um-allyesconfig_um-x86_64 # Build took: 0:02:59.818627