# git rev-parse -q --verify 9d64bf433c53cab2f48a3fff7a1f2a696bc5229a^{commit} 9d64bf433c53cab2f48a3fff7a1f2a696bc5229a already have revision, skipping fetch # git checkout -q -f -B kisskb 9d64bf433c53cab2f48a3fff7a1f2a696bc5229a # git clean -qxdf # < git log -1 # commit 9d64bf433c53cab2f48a3fff7a1f2a696bc5229a # Merge: 57f22c8dab6b d988c9f511af # Author: Linus Torvalds # Date: Fri Jan 19 14:25:23 2024 -0800 # # Merge tag 'perf-tools-for-v6.8-1-2024-01-09' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools # # Pull perf tools updates from Arnaldo Carvalho de Melo: # "Add Namhyung Kim as tools/perf/ co-maintainer, we're taking turns # processing patches, switching roles from perf-tools to perf-tools-next # at each Linux release. # # Data profiling: # # - Associate samples that identify loads and stores with data # structures. This uses events available on Intel, AMD and others and # DWARF info: # # # To get memory access samples in kernel for 1 second (on Intel) # $ perf mem record -a -K --ldlat=4 -- sleep 1 # # # Similar for the AMD (but it requires 6.3+ kernel for BPF filters) # $ perf mem record -a --filter 'mem_op == load || mem_op == store, ip > 0x8000000000000000' -- sleep 1 # # Then, amongst several modes of post processing, one can do things like: # # $ perf report -s type,typeoff --hierarchy --group --stdio # ... # # # # Samples: 10K of events 'cpu/mem-loads,ldlat=4/P, cpu/mem-stores/P, dummy:u' # # Event count (approx.): 602758064 # # # # Overhead Data Type / Data Type Offset # # ........................... ............................ # # # 26.09% 3.28% 0.00% long unsigned int # 26.09% 3.28% 0.00% long unsigned int +0 (no field) # 18.48% 0.73% 0.00% struct page # 10.83% 0.02% 0.00% struct page +8 (lru.next) # 3.90% 0.28% 0.00% struct page +0 (flags) # 3.45% 0.06% 0.00% struct page +24 (mapping) # 0.25% 0.28% 0.00% struct page +48 (_mapcount.counter) # 0.02% 0.06% 0.00% struct page +32 (index) # 0.02% 0.00% 0.00% struct page +52 (_refcount.counter) # 0.02% 0.01% 0.00% struct page +56 (memcg_data) # 0.00% 0.01% 0.00% struct page +16 (lru.prev) # 15.37% 17.54% 0.00% (stack operation) # 15.37% 17.54% 0.00% (stack operation) +0 (no field) # 11.71% 50.27% 0.00% (unknown) # 11.71% 50.27% 0.00% (unknown) +0 (no field) # # $ perf annotate --data-type # ... # Annotate type: 'struct cfs_rq' in [kernel.kallsyms] (13 samples): # ============================================================================ # samples offset size field # 13 0 640 struct cfs_rq { # 2 0 16 struct load_weight load { # 2 0 8 unsigned long weight; # 0 8 4 u32 inv_weight; # }; # 0 16 8 unsigned long runnable_weight; # 0 24 4 unsigned int nr_running; # 1 28 4 unsigned int h_nr_running; # ... # # $ perf annotate --data-type=page --group # Annotate type: 'struct page' in [kernel.kallsyms] (480 samples): # event[0] = cpu/mem-loads,ldlat=4/P # event[1] = cpu/mem-stores/P # event[2] = dummy:u # =================================================================================== # samples offset size field # 447 33 0 0 64 struct page { # 108 8 0 0 8 long unsigned int flags; # 319 13 0 8 40 union { # 319 13 0 8 40 struct { # 236 2 0 8 16 union { # 236 2 0 8 16 struct list_head lru { # 236 1 0 8 8 struct list_head* next; # 0 1 0 16 8 struct list_head* prev; # }; # 236 2 0 8 16 struct { # 236 1 0 8 8 void* __filler; # 0 1 0 16 4 unsigned int mlock_count; # }; # 236 2 0 8 16 struct list_head buddy_list { # 236 1 0 8 8 struct list_head* next; # 0 1 0 16 8 struct list_head* prev; # }; # 236 2 0 8 16 struct list_head pcp_list { # 236 1 0 8 8 struct list_head* next; # 0 1 0 16 8 struct list_head* prev; # }; # }; # 82 4 0 24 8 struct address_space* mapping; # 1 7 0 32 8 union { # 1 7 0 32 8 long unsigned int index; # 1 7 0 32 8 long unsigned int share; # }; # 0 0 0 40 8 long unsigned int private; # }; # # This uses the existing annotate code, calling objdump to do the # disassembly, with improvements to avoid having this take too long, # but longer term a switch to a disassembler library, possibly # reusing code in the kernel will be pursued. # # This is the initial implementation, please use it and report # impressions and bugs. Make sure the kernel-debuginfo packages match # the running kernel. The 'perf report' phase for non short perf.data # files may take a while. # # There is a great article about it on LWN: # # https://lwn.net/Articles/955709/ - "Data-type profiling for perf" # # One last test I did while writing this text, on a AMD Ryzen 5950X, # using a distro kernel, while doing a simple 'find /' on an # otherwise idle system resulted in: # # # uname -r # 6.6.9-100.fc38.x86_64 # # perf -vv | grep BPF_ # bpf: [ on ] # HAVE_LIBBPF_SUPPORT # bpf_skeletons: [ on ] # HAVE_BPF_SKEL # # rpm -qa | grep kernel-debuginfo # kernel-debuginfo-common-x86_64-6.6.9-100.fc38.x86_64 # kernel-debuginfo-6.6.9-100.fc38.x86_64 # # # # perf mem record -a --filter 'mem_op == load || mem_op == store, ip > 0x8000000000000000' # ^C[ perf record: Woken up 1 times to write data ] # [ perf record: Captured and wrote 2.199 MB perf.data (2913 samples) ] # # # # ls -la perf.data # -rw-------. 1 root root 2346486 Jan 9 18:36 perf.data # # perf evlist # ibs_op// # dummy:u # # perf evlist -v # ibs_op//: type: 11, size: 136, config: 0, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ADDR|CPU|PERIOD|IDENTIFIER|DATA_SRC|WEIGHT, read_format: ID, disabled: 1, inherit: 1, freq: 1, sample_id_all: 1 # dummy:u: type: 1 (PERF_TYPE_SOFTWARE), size: 136, config: 0x9 (PERF_COUNT_SW_DUMMY), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|ADDR|CPU|IDENTIFIER|DATA_SRC|WEIGHT, read_format: ID, inherit: 1, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, task: 1, mmap_data: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1 # # # # perf report -s type,typeoff --hierarchy --group --stdio # # Total Lost Samples: 0 # # # # Samples: 2K of events 'ibs_op//, dummy:u' # # Event count (approx.): 1904553038 # # # # Overhead Data Type / Data Type Offset # # ................... ............................ # # # 73.70% 0.00% (unknown) # 73.70% 0.00% (unknown) +0 (no field) # 3.01% 0.00% long unsigned int # 3.00% 0.00% long unsigned int +0 (no field) # 0.01% 0.00% long unsigned int +2 (no field) # 2.73% 0.00% struct task_struct # 1.71% 0.00% struct task_struct +52 (on_cpu) # 0.38% 0.00% struct task_struct +2104 (rcu_read_unlock_special.b.blocked) # 0.23% 0.00% struct task_struct +2100 (rcu_read_lock_nesting) # 0.14% 0.00% struct task_struct +2384 () # 0.06% 0.00% struct task_struct +3096 (signal) # 0.05% 0.00% struct task_struct +3616 (cgroups) # 0.05% 0.00% struct task_struct +2344 (active_mm) # 0.02% 0.00% struct task_struct +46 (flags) # 0.02% 0.00% struct task_struct +2096 (migration_disabled) # 0.01% 0.00% struct task_struct +24 (__state) # 0.01% 0.00% struct task_struct +3956 (mm_cid_active) # 0.01% 0.00% struct task_struct +1048 (cpus_ptr) # 0.01% 0.00% struct task_struct +184 (se.group_node.next) # 0.01% 0.00% struct task_struct +20 (thread_info.cpu) # 0.00% 0.00% struct task_struct +104 (on_rq) # 0.00% 0.00% struct task_struct +2456 (pid) # 1.36% 0.00% struct module # 0.59% 0.00% struct module +952 (kallsyms) # 0.42% 0.00% struct module +0 (state) # 0.23% 0.00% struct module +8 (list.next) # 0.12% 0.00% struct module +216 (syms) # 0.95% 0.00% struct inode # 0.41% 0.00% struct inode +40 (i_sb) # 0.22% 0.00% struct inode +0 (i_mode) # 0.06% 0.00% struct inode +76 (i_rdev) # 0.06% 0.00% struct inode +56 (i_security) # # # perf top/report: # # - Don't ignore job control, allowing control+Z + bg to work. # # - Add s390 raw data interpretation for PAI (Processor Activity # Instrumentation) counters. # # perf archive: # # - Add new option '--all' to pack perf.data with DSOs. # # - Add new option '--unpack' to expand tarballs. # # Initialization speedups: # # - Lazily initialize zstd streams to save memory when not using it. # # - Lazily allocate/size mmap event copy. # # - Lazy load kernel symbols in 'perf record'. # # - Be lazier in allocating lost samples buffer in 'perf record'. # # - Don't synthesize BPF events when disabled via the command line # (perf record --no-bpf-event). # # Assorted improvements: # # - Show note on AMD systems that the :p, :pp, :ppp and :P are all the # same, as IBS (Instruction Based Sampling) is used and it is # inherentely precise, not having levels of precision like in Intel # systems. # # - When 'cycles' isn't available, fall back to the "task-clock" event # when not system wide, not to 'cpu-clock'. # # - Add --debug-file option to redirect debug output, e.g.: # # $ perf --debug-file /tmp/perf.log record -v true # # - Shrink 'struct map' to under one cacheline by avoiding function # pointers for selecting if addresses are identity or DSO relative, # and using just a byte for some boolean struct members. # # - Resolve the arch specific strerrno just once to use in # perf_env__arch_strerrno(). # # - Reduce memory for recording PERF_RECORD_LOST_SAMPLES event. # # Assorted fixes: # # - Fix the default 'perf top' usage on Intel hybrid systems, now it # starts with a browser showing the number of samples for Efficiency # (cpu_atom/cycles/P) and Performance (cpu_core/cycles/P). This # behaviour is similar on ARM64, with its respective set of # big.LITTLE processors. # # - Fix segfault on build_mem_topology() error path. # # - Fix 'perf mem' error on hybrid related to availability of mem event # in a PMU. # # - Fix missing reference count gets (map, maps) in the db-export code. # # - Avoid recursively taking env->bpf_progs.lock in the 'perf_env' # code. # # - Use the newly introduced maps__for_each_map() to add missing # locking around iteration of 'struct map' entries. # # - Parse NOTE segments until the build id is found, don't stop on the # first one, ELF files may have several such NOTE segments. # # - Remove 'egrep' usage, its deprecated, use 'grep -E' instead. # # - Warn first about missing libelf, not libbpf, that depends on # libelf. # # - Use alternative to 'find ... -printf' as this isn't supported in # busybox. # # - Address python 3.6 DeprecationWarning for string scapes. # # - Fix memory leak in uniq() in libsubcmd. # # - Fix man page formatting for 'perf lock' # # - Fix some spelling mistakes. # # perf tests: # # - Fail shell tests that needs some symbol in perf itself if it is # stripped. These tests check if a symbol is resolved, if some hot # function is indeed detected by profiling, etc. # # - The 'perf test sigtrap' test is currently failing on PREEMPT_RT, # skip it if sleeping spinlocks are detected (using BTF) and point to # the mailing list discussion about it. This test is also being # skipped on several architectures (powerpc, s390x, arm and aarch64) # due to other pending issues with intruction breakpoints. # # - Adjust test case perf record offcpu profiling tests for s390. # # - Fix 'Setup struct perf_event_attr' fails on s390 on z/VM guest, # addressing issues caused by the fallback from cycles to task-clock # done in this release. # # - Fix mask for VG register in the user-regs test. # # - Use shellcheck on 'perf test' shell scripts automatically to make # sure changes don't introduce things it flags as problematic. # # - Add option to change objdump binary and allow it to be set via # 'perf config'. # # - Add basic 'perf script', 'perf list --json" and 'perf diff' tests. # # - Basic branch counter support. # # - Make DSO tests a suite rather than individual. # # - Remove atomics from test_loop to avoid test failures. # # - Fix call chain match on powerpc for the record+probe_libc_inet_pton # test. # # - Improve Intel hybrid tests. # # Vendor event files (JSON): # # powerpc: # # - Update datasource event name to fix duplicate events on IBM's # Power10. # # - Add PVN for HX-C2000 CPU with Power8 Architecture. # # Intel: # # - Alderlake/rocketlake metric fixes. # # - Update emeraldrapids events to v1.02. # # - Update icelakex events to v1.23. # # - Update sapphirerapids events to v1.17. # # - Add skx, clx, icx and spr upi bandwidth metric. # # AMD: # # - Add Zen 4 memory controller events. # # RISC-V: # # - Add StarFive Dubhe-80 and Dubhe-90 JSON files. # https://www.starfivetech.com/en/site/cpu-u # # - Add T-HEAD C9xx JSON file. # https://github.com/riscv-software-src/opensbi/blob/master/docs/platform/thead-c9xx.md # # ARM64: # # - Remove UTF-8 characters from cmn.json, that were causing build # failure in some distros. # # - Add core PMU events and metrics for Ampere One X. # # - Rename Ampere One's BPU_FLUSH_MEM_FAULT to GPC_FLUSH_MEM_FAULT # # libperf: # # - Rename several perf_cpu_map constructor names to clarify what they # really do. # # - Ditto for some other methods, coping with some issues in their # semantics, like perf_cpu_map__empty() -> # perf_cpu_map__has_any_cpu_or_is_empty(). # # - Document perf_cpu_map__nr()'s behavior # # perf stat: # # - Exit if parse groups fails. # # - Combine the -A/--no-aggr and --no-merge options. # # - Fix help message for --metric-no-threshold option. # # Hardware tracing: # # ARM64 CoreSight: # # - Bump minimum OpenCSD version to ensure a bugfix is present. # # - Add 'T' itrace option for timestamp trace # # - Set start vm addr of exectable file to 0 and don't ignore first # sample on the arm-cs-trace-disasm.py 'perf script'" # # * tag 'perf-tools-for-v6.8-1-2024-01-09' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools: (179 commits) # MAINTAINERS: Add Namhyung as tools/perf/ co-maintainer # perf test: test case 'Setup struct perf_event_attr' fails on s390 on z/vm # perf db-export: Fix missing reference count get in call_path_from_sample() # perf tests: Add perf script test # libsubcmd: Fix memory leak in uniq() # perf TUI: Don't ignore job control # perf vendor events intel: Update sapphirerapids events to v1.17 # perf vendor events intel: Update icelakex events to v1.23 # perf vendor events intel: Update emeraldrapids events to v1.02 # perf vendor events intel: Alderlake/rocketlake metric fixes # perf x86 test: Add hybrid test for conflicting legacy/sysfs event # perf x86 test: Update hybrid expectations # perf vendor events amd: Add Zen 4 memory controller events # perf stat: Fix hard coded LL miss units # perf record: Reduce memory for recording PERF_RECORD_LOST_SAMPLES event # perf env: Avoid recursively taking env->bpf_progs.lock # perf annotate: Add --insn-stat option for debugging # perf annotate: Add --type-stat option for debugging # perf annotate: Support event group display # perf annotate: Add --data-type option # ... # < /opt/cross/kisskb/korg/gcc-8.5.0-nolibc/x86_64-linux/bin/x86_64-linux-gcc --version # < /opt/cross/kisskb/korg/gcc-8.5.0-nolibc/x86_64-linux/bin/x86_64-linux-ld --version # < git log --format=%s --max-count=1 9d64bf433c53cab2f48a3fff7a1f2a696bc5229a # make -s -j 40 ARCH=x86 O=/kisskb/build/linus_x86_64_defconfig_x86_64-gcc8 CROSS_COMPILE=/opt/cross/kisskb/korg/gcc-8.5.0-nolibc/x86_64-linux/bin/x86_64-linux- x86_64_defconfig # < make -s -j 40 ARCH=x86 O=/kisskb/build/linus_x86_64_defconfig_x86_64-gcc8 CROSS_COMPILE=/opt/cross/kisskb/korg/gcc-8.5.0-nolibc/x86_64-linux/bin/x86_64-linux- help # make -s -j 40 ARCH=x86 O=/kisskb/build/linus_x86_64_defconfig_x86_64-gcc8 CROSS_COMPILE=/opt/cross/kisskb/korg/gcc-8.5.0-nolibc/x86_64-linux/bin/x86_64-linux- olddefconfig # make -s -j 40 ARCH=x86 O=/kisskb/build/linus_x86_64_defconfig_x86_64-gcc8 CROSS_COMPILE=/opt/cross/kisskb/korg/gcc-8.5.0-nolibc/x86_64-linux/bin/x86_64-linux- Completed OK # rm -rf /kisskb/build/linus_x86_64_defconfig_x86_64-gcc8 # Build took: 0:01:33.403210