Skip to content

Commit

Permalink
utilities: Add upcall statistics to the kernel_delay.py script.
Browse files Browse the repository at this point in the history
This patch installs a kernel return probe on ovs_dp_upcall() to record
all successful and failed calls per CPU. The statistics are reported
when the script exits, providing insights into the upcall behavior.

This is an example output:

  # UPCALL STATISTICS (TOTAL [CPU_ID: N_UPCALLS_PER_CPU, ...]):
    Total upcalls       : 183247 [0: 8937, 2: 14464, 4: 10372, 6: 4254, ...]
    Successfull upcalls : 120195 [0: 5945, 2: 6379, 4: 5966, 6: 4254, ...]
    Failed upcalls      : 63052 [0: 2992, 2: 8085, 4: 4406, 8: 2275, ...]
       11, EAGAIN       : 63052 [0: 2992, 2: 8085, 4: 4406, 8: 2275, ...]

Tested-by: Adrian Moreno <amoreno@redhat.com>
Reviewed-by: Adrian Moreno <amoreno@redhat.com>
Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
  • Loading branch information
chaudron committed Dec 20, 2024
1 parent a48b327 commit 638433b
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 1 deletion.
1 change: 1 addition & 0 deletions utilities/automake.mk
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ FLAKE8_PYFILES += utilities/ovs-pcap.in \
utilities/ovs-pipegen.py \
utilities/usdt-scripts/dpif_op_nl_monitor.py \
utilities/usdt-scripts/flow_reval_monitor.py \
utilities/usdt-scripts/kernel_delay.py \
utilities/usdt-scripts/upcall_monitor.py \
utilities/usdt-scripts/upcall_cost.py

Expand Down
136 changes: 135 additions & 1 deletion utilities/usdt-scripts/kernel_delay.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#
import argparse
import datetime
import errno
import os
import pytz
import psutil
Expand Down Expand Up @@ -556,6 +557,36 @@
data->start_ns = 0;
return 0;
}
/*
* For measuring upcall statistics (per CPU).
*/
BPF_PERCPU_HASH(upcall_count);
#if <INSTALL_OVS_DP_UPCALL_PROBE>
int kretprobe__ovs_dp_upcall(struct pt_regs *ctx)
{
int ret = PT_REGS_RC(ctx);
u64 zero = 0;
u64 *entry;
u64 key;
if (!capture_enabled__())
return 0;
if (ret >= 0)
key = 0;
else
key = -ret;
entry = upcall_count.lookup_or_try_init(&key, &zero);
if (entry)
*entry += 1;
return 0;
}
#endif /* For measuring upcall statistics/errors. */
"""


Expand Down Expand Up @@ -887,6 +918,7 @@ def reset_capture():
bpf["stack_traces"].clear()
bpf["stop_start"].clear()
bpf["stop_data"].clear()
bpf["upcall_count"].clear()


#
Expand All @@ -900,6 +932,97 @@ def print_timestamp(msg):
print(time_string)


#
# Get errno short string
#
def get_errno_short(err):
try:
return errno.errorcode[err]
except KeyError:
return "_unknown_"


#
# Format a eBPF per-cpu hash entry (if the count is > 0)
#
def format_per_cpu_hash(cpu_hash, key=None, skip_key=None):
per_cpu = ""

if key is not None:
total = cpu_hash.sum(key).value
if total > 0:
for cpu, value in enumerate(cpu_hash.getvalue(key)):
if value == 0:
continue

per_cpu += " {}: {},".format(cpu, value)
else:
total = 0
total_cpu = None

for key in cpu_hash.keys():
if skip_key is not None and skip_key.value == key.value:
continue

if total_cpu is None:
total_cpu = [0] * len(cpu_hash.getvalue(key))

for cpu, value in enumerate(cpu_hash.getvalue(key)):
total_cpu[cpu] += value
total += value

if total >= 0 and total_cpu:
for cpu, value in enumerate(total_cpu):
if value == 0:
continue

per_cpu += " {}: {},".format(cpu, value)

return total, per_cpu.strip(", ")


#
# Display kernel upcall statistics
#
def display_upcall_results():
upcalls = bpf["upcall_count"]
have_upcalls = False

for k in upcalls:
if upcalls.sum(k).value == 0:
continue
have_upcalls = True
break

if not have_upcalls:
return

print("\n\n# UPCALL STATISTICS (TOTAL [CPU_ID: N_UPCALLS_PER_CPU, ...]):\n"
" Total upcalls : {} [{}]".format(
*format_per_cpu_hash(upcalls)))

for k in sorted(upcalls, key=lambda x: int(x.value)):
error = k.value
total, per_cpu = format_per_cpu_hash(upcalls, key=k)

if error != 0 and total == 0:
continue

if error == 0:
total_failed, per_cpu_failed = format_per_cpu_hash(upcalls,
skip_key=k)
if total_failed == 0:
continue

print(" Successfull upcalls : {} [{}]".format(total, per_cpu))
print(" Failed upcalls : {} [{}]".format(total_failed,
per_cpu_failed))
else:
print(" {:3}, {:13}: {} [{}]".format(error,
get_errno_short(error),
total, per_cpu))


#
# process_results()
#
Expand Down Expand Up @@ -1074,7 +1197,12 @@ def process_results(syscall_events=None, trigger_delta=None):
indent, "TOTAL:", "", total_count, total_ns))

#
# Print events
# Print upcall statistics
#
display_upcall_results()

#
# Print syscall events
#
lost_stack_traces = 0
if syscall_events:
Expand Down Expand Up @@ -1194,6 +1322,9 @@ def main():
parser.add_argument("--skip-syscall-poll-events",
help="Skip poll() syscalls with --syscall-events",
action="store_true")
parser.add_argument("--skip-upcall-stats",
help="Skip the collection of upcall statistics",
action="store_true")
parser.add_argument("--stack-trace-size",
help="Number of unique stack traces that can be "
"recorded, default 4096. 0 to disable",
Expand Down Expand Up @@ -1298,6 +1429,9 @@ def main():
source = source.replace("<STACK_TRACE_ENABLED>", "true"
if options.stack_trace_size > 0 else "false")

source = source.replace("<INSTALL_OVS_DP_UPCALL_PROBE>", "0"
if options.skip_upcall_stats else "1")

#
# Handle start/stop probes
#
Expand Down
9 changes: 9 additions & 0 deletions utilities/usdt-scripts/kernel_delay.rst
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ followed by resource-specific data. Which are:
- ``THREAD STOPPED STATISTICS``
- ``HARD IRQ STATISTICS``
- ``SOFT IRQ STATISTICS``
- ``UPCALL STATISTICS``

The following sections will describe in detail what statistics they report.

Expand Down Expand Up @@ -187,6 +188,14 @@ number of interrupts (``COUNT``), the total time spent in the interrupt
handler (``TOTAL ns``), and the worst-case duration (``MAX ns``).


``UPCALL STATISTICS``
~~~~~~~~~~~~~~~~~~~~~
The ``UPCALL STATISTICS`` section provides information on the number of
upcalls sent by the kernel to userspace. If any upcalls fail to be sent,
the specific return codes are recorded. Statistics are presented both as
a total and on a per-CPU basis.


The ``--syscall-events`` option
-------------------------------
In addition to reporting global syscall statistics in ``SYSCALL_STATISTICS``,
Expand Down

0 comments on commit 638433b

Please sign in to comment.