# git rev-parse -q --verify 80d47f5de5e311cbc0d01ebb6ee684e8f4c196c6^{commit} 80d47f5de5e311cbc0d01ebb6ee684e8f4c196c6 already have revision, skipping fetch # git checkout -q -f -B kisskb 80d47f5de5e311cbc0d01ebb6ee684e8f4c196c6 # git clean -qxdf # < git log -1 # commit 80d47f5de5e311cbc0d01ebb6ee684e8f4c196c6 # Author: Linus Torvalds # Date: Thu Feb 17 08:57:47 2022 -0800 # # mm: don't try to NUMA-migrate COW pages that have other uses # # Oded Gabbay reports that enabling NUMA balancing causes corruption with # his Gaudi accelerator test load: # # "All the details are in the bug, but the bottom line is that somehow, # this patch causes corruption when the numa balancing feature is # enabled AND we don't use process affinity AND we use GUP to pin pages # so our accelerator can DMA to/from system memory. # # Either disabling numa balancing, using process affinity to bind to # specific numa-node or reverting this patch causes the bug to # disappear" # # and Oded bisected the issue to commit 09854ba94c6a ("mm: do_wp_page() # simplification"). # # Now, the NUMA balancing shouldn't actually be changing the writability # of a page, and as such shouldn't matter for COW. But it appears it # does. Suspicious. # # However, regardless of that, the condition for enabling NUMA faults in # change_pte_range() is nonsensical. It uses "page_mapcount(page)" to # decide if a COW page should be NUMA-protected or not, and that makes # absolutely no sense. # # The number of mappings a page has is irrelevant: not only does GUP get a # reference to a page as in Oded's case, but the other mappings might be # paged out and the only reference to them would be in the page count. # # Since we should never try to NUMA-balance a page that we can't move # anyway due to other references, just fix the code to use 'page_count()'. # Oded confirms that that fixes his issue. 
# # Now, this does imply that something in NUMA balancing ends up changing # page protections (other than the obvious one of making the page # inaccessible to get the NUMA faulting information). Otherwise the COW # simplification wouldn't matter - since doing the GUP on the page would # make sure it's writable. # # The cause of that permission change would be good to figure out too, # since it clearly results in spurious COW events - but fixing the # nonsensical test that just happened to work before is obviously the # CorrectThing(tm) to do regardless. # # Fixes: 09854ba94c6a ("mm: do_wp_page() simplification") # Link: https://bugzilla.kernel.org/show_bug.cgi?id=215616 # Link: https://lore.kernel.org/all/CAFCwf10eNmwq2wD71xjUhqkvv5+_pJMR1nPug2RqNDcFT4H86Q@mail.gmail.com/ # Reported-and-tested-by: Oded Gabbay # Cc: David Hildenbrand # Cc: Peter Xu # Signed-off-by: Linus Torvalds # < /opt/cross/kisskb/korg/gcc-11.1.0-nolibc/sparc64-linux/bin/sparc64-linux-gcc --version # < /opt/cross/kisskb/korg/gcc-11.1.0-nolibc/sparc64-linux/bin/sparc64-linux-ld --version # < git log --format=%s --max-count=1 80d47f5de5e311cbc0d01ebb6ee684e8f4c196c6 # < make -s -j 48 ARCH=sparc O=/kisskb/build/linus_sparc-allmodconfig_sparc64-gcc11 CROSS_COMPILE=/opt/cross/kisskb/korg/gcc-11.1.0-nolibc/sparc64-linux/bin/sparc64-linux- allmodconfig # Added to kconfig CONFIG_64BIT=n # Added to kconfig CONFIG_BUILD_DOCSRC=n # Added to kconfig CONFIG_HAVE_FTRACE_MCOUNT_RECORD=n # Added to kconfig CONFIG_SAMPLES=n # Added to kconfig CONFIG_MODULE_SIG=n # < make -s -j 48 ARCH=sparc O=/kisskb/build/linus_sparc-allmodconfig_sparc64-gcc11 CROSS_COMPILE=/opt/cross/kisskb/korg/gcc-11.1.0-nolibc/sparc64-linux/bin/sparc64-linux- help # make -s -j 48 ARCH=sparc O=/kisskb/build/linus_sparc-allmodconfig_sparc64-gcc11 CROSS_COMPILE=/opt/cross/kisskb/korg/gcc-11.1.0-nolibc/sparc64-linux/bin/sparc64-linux- olddefconfig # make -s -j 48 ARCH=sparc O=/kisskb/build/linus_sparc-allmodconfig_sparc64-gcc11 
CROSS_COMPILE=/opt/cross/kisskb/korg/gcc-11.1.0-nolibc/sparc64-linux/bin/sparc64-linux- <stdin>:1517:2: warning: #warning syscall clone3 not implemented [-Wcpp] /kisskb/src/arch/sparc/mm/srmmu.c: In function 'smp_flush_page_for_dma': /kisskb/src/arch/sparc/mm/srmmu.c:1639:13: error: cast between incompatible function types from 'void (*)(long unsigned int)' to 'void (*)(long unsigned int, long unsigned int, long unsigned int, long unsigned int, long unsigned int)' [-Werror=cast-function-type] 1639 | xc1((smpfunc_t) local_ops->page_for_dma, page); | ^ /kisskb/src/arch/sparc/mm/srmmu.c: In function 'smp_flush_cache_mm': /kisskb/src/arch/sparc/mm/srmmu.c:1662:29: error: cast between incompatible function types from 'void (*)(struct mm_struct *)' to 'void (*)(long unsigned int, long unsigned int, long unsigned int, long unsigned int, long unsigned int)' [-Werror=cast-function-type] 1662 | xc1((smpfunc_t) local_ops->cache_mm, (unsigned long) mm); | ^ /kisskb/src/arch/sparc/mm/srmmu.c: In function 'smp_flush_tlb_mm': /kisskb/src/arch/sparc/mm/srmmu.c:1674:29: error: cast between incompatible function types from 'void (*)(struct mm_struct *)' to 'void (*)(long unsigned int, long unsigned int, long unsigned int, long unsigned int, long unsigned int)' [-Werror=cast-function-type] 1674 | xc1((smpfunc_t) local_ops->tlb_mm, (unsigned long) mm); | ^ /kisskb/src/arch/sparc/mm/srmmu.c: In function 'smp_flush_cache_range': /kisskb/src/arch/sparc/mm/srmmu.c:1694:29: error: cast between incompatible function types from 'void (*)(struct vm_area_struct *, long unsigned int, long unsigned int)' to 'void (*)(long unsigned int, long unsigned int, long unsigned int, long unsigned int, long unsigned int)' [-Werror=cast-function-type] 1694 | xc3((smpfunc_t) local_ops->cache_range, | ^ /kisskb/src/arch/sparc/mm/srmmu.c: In function 'smp_flush_tlb_range': /kisskb/src/arch/sparc/mm/srmmu.c:1711:29: error: cast between incompatible function types from 'void (*)(struct vm_area_struct *, long unsigned 
int, long unsigned int)' to 'void (*)(long unsigned int, long unsigned int, long unsigned int, long unsigned int, long unsigned int)' [-Werror=cast-function-type] 1711 | xc3((smpfunc_t) local_ops->tlb_range, | ^ /kisskb/src/arch/sparc/mm/srmmu.c: In function 'smp_flush_cache_page': /kisskb/src/arch/sparc/mm/srmmu.c:1726:29: error: cast between incompatible function types from 'void (*)(struct vm_area_struct *, long unsigned int)' to 'void (*)(long unsigned int, long unsigned int, long unsigned int, long unsigned int, long unsigned int)' [-Werror=cast-function-type] 1726 | xc2((smpfunc_t) local_ops->cache_page, | ^ /kisskb/src/arch/sparc/mm/srmmu.c: In function 'smp_flush_tlb_page': /kisskb/src/arch/sparc/mm/srmmu.c:1741:29: error: cast between incompatible function types from 'void (*)(struct vm_area_struct *, long unsigned int)' to 'void (*)(long unsigned int, long unsigned int, long unsigned int, long unsigned int, long unsigned int)' [-Werror=cast-function-type] 1741 | xc2((smpfunc_t) local_ops->tlb_page, | ^ /kisskb/src/arch/sparc/mm/srmmu.c: In function 'smp_flush_page_to_ram': /kisskb/src/arch/sparc/mm/srmmu.c:1756:13: error: cast between incompatible function types from 'void (*)(long unsigned int)' to 'void (*)(long unsigned int, long unsigned int, long unsigned int, long unsigned int, long unsigned int)' [-Werror=cast-function-type] 1756 | xc1((smpfunc_t) local_ops->page_to_ram, page); | ^ /kisskb/src/arch/sparc/mm/srmmu.c: In function 'smp_flush_sig_insns': /kisskb/src/arch/sparc/mm/srmmu.c:1767:21: error: cast between incompatible function types from 'void (*)(struct mm_struct *, long unsigned int)' to 'void (*)(long unsigned int, long unsigned int, long unsigned int, long unsigned int, long unsigned int)' [-Werror=cast-function-type] 1767 | xc2((smpfunc_t) local_ops->sig_insns, | ^ cc1: all warnings being treated as errors make[3]: *** [/kisskb/src/scripts/Makefile.build:288: arch/sparc/mm/srmmu.o] Error 1 make[3]: *** Waiting for unfinished jobs.... 
make[2]: *** [/kisskb/src/scripts/Makefile.build:550: arch/sparc/mm] Error 2 make[2]: *** Waiting for unfinished jobs.... make[1]: *** [/kisskb/src/Makefile:1831: arch/sparc] Error 2 make[1]: *** Waiting for unfinished jobs.... /kisskb/src/crypto/blake2b_generic.c: In function 'blake2b_compress_one_generic': /kisskb/src/crypto/blake2b_generic.c:109:1: error: the frame size of 2288 bytes is larger than 2048 bytes [-Werror=frame-larger-than=] 109 | } | ^ cc1: all warnings being treated as errors make[2]: *** [/kisskb/src/scripts/Makefile.build:288: crypto/blake2b_generic.o] Error 1 make[1]: *** [/kisskb/src/Makefile:1831: crypto] Error 2 make: *** [Makefile:219: __sub-make] Error 2 Command 'make -s -j 48 ARCH=sparc O=/kisskb/build/linus_sparc-allmodconfig_sparc64-gcc11 CROSS_COMPILE=/opt/cross/kisskb/korg/gcc-11.1.0-nolibc/sparc64-linux/bin/sparc64-linux- ' returned non-zero exit status 2 # rm -rf /kisskb/build/linus_sparc-allmodconfig_sparc64-gcc11 # Build took: 0:15:50.540065