From 1d9b04e8859cedf2c24880ea76632353a04706cd Mon Sep 17 00:00:00 2001 From: nokyan Date: Sun, 15 Dec 2024 09:58:32 +0100 Subject: [PATCH] Add debug and trace prints for resources-processes --- Cargo.lock | 2 + lib/process_data/Cargo.toml | 2 + lib/process_data/src/lib.rs | 117 +++++++++++++++++++++++++++------ src/bin/resources-processes.rs | 11 ++++ src/utils/gpu/nvidia.rs | 12 +--- src/utils/process.rs | 17 ++++- 6 files changed, 128 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e946152c..8f347de9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1278,9 +1278,11 @@ dependencies = [ "glob", "lazy-regex", "libc", + "log", "num_cpus", "nutype", "nvml-wrapper", + "pretty_env_logger", "serde", "syscalls", "sysconf", diff --git a/lib/process_data/Cargo.toml b/lib/process_data/Cargo.toml index 52489eb8..9de3bccb 100644 --- a/lib/process_data/Cargo.toml +++ b/lib/process_data/Cargo.toml @@ -21,9 +21,11 @@ anyhow = "1.0.94" glob = "0.3.1" lazy-regex = "3.3.0" libc = "0.2.167" +log = "0.4.22" num_cpus = "1.16.0" nutype = { version = "0.5.0", features = ["serde"] } nvml-wrapper = "0.10.0" +pretty_env_logger = "0.5" serde = { version = "1.0.215", features = ["serde_derive"] } syscalls = { version = "0.6.18", features = ["all"] } sysconf = "0.3.4" diff --git a/lib/process_data/src/lib.rs b/lib/process_data/src/lib.rs index 506a3590..e2ffbf9c 100644 --- a/lib/process_data/src/lib.rs +++ b/lib/process_data/src/lib.rs @@ -3,6 +3,7 @@ pub mod pci_slot; use anyhow::{bail, Context, Result}; use glob::glob; use lazy_regex::{lazy_regex, Lazy, Regex}; +use log::{debug, trace, warn}; use nutype::nutype; use nvml_wrapper::enums::device::UsedGpuMemory; use nvml_wrapper::error::NvmlError; @@ -27,10 +28,18 @@ const STAT_STARTTIME: usize = 21 - STAT_OFFSET; const GPU_DRIVER_NAMES: &[&str] = &["amdgpu", "i915"]; const NPU_DRIVER_NAMES: &[&str] = &["amdxdna_accel_driver"]; +const MAJOR: u32 = 226; + static USERS_CACHE: LazyLock> = LazyLock::new(|| unsafe { - uzers::all_users() - .map(|user| (user.uid(), user.name().to_string_lossy().to_string())) - .collect() + debug!("Initializing users cache…"); + let users: HashMap = uzers::all_users() + .map(|user| { + trace!("Found user {}", user.name().to_string_lossy()); + (user.uid(), user.name().to_string_lossy().to_string()) + }) + .collect(); + debug!("Found {} users", users.len()); + users }); static PAGESIZE: LazyLock = LazyLock::new(sysconf::pagesize); @@ -82,16 +91,25 @@ static RE_DRM_ENGINE_VIDEO: Lazy = lazy_regex!(r"drm-engine-video:\s*(\d+ static RE_DRM_TOTAL_MEMORY: Lazy = lazy_regex!(r"drm-total-memory:\s*(\d+)\s*KiB"); -static NVML: Lazy> = Lazy::new(Nvml::init); +static NVML: Lazy> = Lazy::new(|| { + debug!("Initializing connection to NVML…"); + Nvml::init().inspect_err(|err| warn!("Unable to connect to NVML: {err}")) +}); static NVML_DEVICES: Lazy> = Lazy::new(|| { if let Ok(nvml) = NVML.as_ref() { + debug!("Looking for NVIDIA devices…"); let device_count = nvml.device_count().unwrap_or(0); let mut return_vec = Vec::with_capacity(device_count as usize); for i in 0..device_count { if let Ok(gpu) = nvml.device_by_index(i) { if let Ok(pci_slot) = gpu.pci_info().map(|pci_info| pci_info.bus_id) { let pci_slot = PciSlot::from_str(&pci_slot).unwrap(); + debug!( + "Found {} at {}", + gpu.name().unwrap_or("N/A".into()), + pci_slot + ); return_vec.push((pci_slot, gpu)); } } @@ -253,13 +271,6 @@ impl ProcessData { pub fn try_from_path>(proc_path: P) -> Result { let proc_path = proc_path.as_ref(); - let stat = std::fs::read_to_string(proc_path.join("stat"))?; - let statm = std::fs::read_to_string(proc_path.join("statm"))?; - let status = std::fs::read_to_string(proc_path.join("status"))?; - let comm = std::fs::read_to_string(proc_path.join("comm"))?; - let commandline = std::fs::read_to_string(proc_path.join("cmdline"))?; - let io = std::fs::read_to_string(proc_path.join("io")).ok(); - let pid = proc_path .file_name() .context("proc_path terminates in ..")? @@ -267,6 +278,23 @@ impl ProcessData { .context("can't turn OsStr to str")? .parse()?; + trace!("Inspecting process {pid}…"); + + trace!("Reading info files…"); + let stat = std::fs::read_to_string(proc_path.join("stat")) + .inspect_err(|err| trace!("Error reading 'stat': {err}"))?; + let statm = std::fs::read_to_string(proc_path.join("statm")) + .inspect_err(|err| trace!("Error reading 'statm': {err}"))?; + let status = std::fs::read_to_string(proc_path.join("status")) + .inspect_err(|err| trace!("Error reading 'status': {err}"))?; + let comm = std::fs::read_to_string(proc_path.join("comm")) + .inspect_err(|err| trace!("Error reading 'comm': {err}"))?; + let commandline = std::fs::read_to_string(proc_path.join("cmdline")) + .inspect_err(|err| trace!("Error reading 'cmdline': {err}"))?; + let io = std::fs::read_to_string(proc_path.join("io")) + .inspect_err(|err| trace!("Error reading 'io': {err}")) + .ok(); + let user = USERS_CACHE .get(&Self::get_uid(proc_path)?) .cloned() @@ -275,7 +303,8 @@ impl ProcessData { let stat = stat .split(')') // since we don't care about the pid or the executable name, split after the executable name to make our life easier .last() - .context("stat doesn't have ')'")? + .context("stat doesn't have ')'") + .inspect_err(|err| trace!("Can't parse 'stat': {err}"))? .split(' ') .skip(1) // the first element would be a space, let's ignore that .collect::>(); @@ -288,23 +317,28 @@ impl ProcessData { let parent_pid = stat .get(STAT_PARENT_PID) .context("wrong stat file format") - .and_then(|x| x.parse().context("couldn't parse stat file content"))?; + .and_then(|x| x.parse().context("couldn't parse stat file content to int")) + .inspect_err(|err| trace!("Can't parse parent pid from 'stat': {err}"))?; let user_cpu_time = stat .get(STAT_USER_CPU_TIME) .context("wrong stat file format") - .and_then(|x| x.parse().context("couldn't parse stat file content"))?; + .and_then(|x| x.parse().context("couldn't parse stat file content to int")) + .inspect_err(|err| trace!("Can't parse user cpu time from 'stat': {err}"))?; let system_cpu_time = stat .get(STAT_SYSTEM_CPU_TIME) .context("wrong stat file format") - .and_then(|x| x.parse().context("couldn't parse stat file content"))?; + .and_then(|x| x.parse().context("couldn't parse stat file content to int")) + .inspect_err(|err| trace!("Can't parse system cpu time from 'stat': {err}"))?; let nice = stat .get(STAT_NICE) .context("wrong stat file format") - .and_then(|x| x.parse().context("couldn't parse stat file content"))?; + .and_then(|x| x.parse().context("couldn't parse stat file content to int")) + .inspect_err(|err| trace!("Can't parse nice from 'stat': {err}"))?; let starttime = stat .get(STAT_STARTTIME) .context("wrong stat file format") - .and_then(|x| x.parse().context("couldn't parse stat file content"))?; + .and_then(|x| x.parse().context("couldn't parse stat file content to int")) + .inspect_err(|err| trace!("Can't parse start time from 'stat': {err}"))?; let mut affinity = Vec::with_capacity(*NUM_CPUS); RE_AFFINITY @@ -340,7 +374,8 @@ impl ProcessData { .and_then(|x| { x.parse::() .context("couldn't parse statm file content") - })? + }) + .inspect_err(|err| trace!("Can't parse memory usage from 'statm': {err}"))? .saturating_sub( statm .get(2) @@ -353,6 +388,7 @@ impl ProcessData { .saturating_mul(*PAGESIZE); let cgroup = std::fs::read_to_string(proc_path.join("cgroup")) + .inspect_err(|err| trace!("Can't read cgroup: {err}")) .ok() .and_then(Self::sanitize_cgroup); @@ -408,6 +444,7 @@ impl ProcessData { } fn gpu_usage_stats(proc_path: &Path, pid: i32) -> BTreeMap { + trace!("Gathering GPU stats…"); let nvidia_stats = Self::nvidia_gpu_stats_all(pid); let mut other_stats = Self::other_gpu_usage_stats(proc_path, pid).unwrap_or_default(); other_stats.extend(nvidia_stats); @@ -431,22 +468,30 @@ impl ProcessData { .parse::() .unwrap_or(0); if fd_num <= 2 { + trace!( + "fdinfo {fd_num} deemed as not plausible. Reason: fd_num ≤ 2 (probably std stream)" + ); return (false, fd_num); } let _file = std::fs::File::open(&fdinfo_path); if _file.is_err() { - return (true, fd_num); + trace!("fdinfo {fd_num} deemed as not plausible. Reason: File can't be opened"); + return (false, fd_num); } let file = _file.unwrap(); let _metadata = file.metadata(); if _metadata.is_err() { - return (true, fd_num); + trace!( + "fdinfo {fd_num} deemed as not plausible. Reason: File's metadata can't be read" + ); + return (false, fd_num); } let metadata = _metadata.unwrap(); if !metadata.is_file() { + trace!("fdinfo {fd_num} deemed as not plausible. Reason: Not a file"); return (false, fd_num); } @@ -455,8 +500,15 @@ impl ProcessData { let fd_path = fdinfo_path.to_str().map(|s| s.replace("fdinfo", "fd")); if let Some(fd_path) = fd_path { if let Ok(fd_metadata) = std::fs::metadata(fd_path) { + if (fd_metadata.st_mode() & libc::S_IFMT) != libc::S_IFCHR { + trace!("fdinfo {fd_num} deemed as not plausible. Reason: Wrong st_mode"); + return (false, fd_num); + } let major = unsafe { libc::major(fd_metadata.st_rdev()) }; - if (fd_metadata.st_mode() & libc::S_IFMT) != libc::S_IFCHR || major != 226 { + if major != MAJOR { + trace!( + "fdinfo {fd_num} deemed as not plausible. Reason: Wrong major (expected: {MAJOR}, got: {major})" + ); return (false, fd_num); } } @@ -470,9 +522,11 @@ impl ProcessData { == 0 }); if not_unique { + trace!("fdinfo {fd_num} deemed as not plausible. Reason: kcmp indicated that we've already seen this file"); return (false, fd_num); } + trace!("fdinfo {fd_num} deemed as plausible"); (true, fd_num) } @@ -480,6 +534,7 @@ impl ProcessData { proc_path: &Path, pid: i32, ) -> Result> { + trace!("Gathering other GPU stats…"); let fdinfo_dir = proc_path.join("fdinfo"); let mut seen_fds = HashSet::new(); @@ -520,6 +575,7 @@ impl ProcessData { } fn npu_usage_stats(proc_path: &Path, pid: i32) -> Result> { + trace!("Gathering NPU stats…"); let fdinfo_dir = proc_path.join("fdinfo"); let mut seen_fds = HashSet::new(); @@ -554,6 +610,10 @@ impl ProcessData { } fn read_npu_fdinfo>(fdinfo_path: P) -> Result<(PciSlot, NpuUsageStats)> { + trace!( + "Reading and parsing {} for NPU stats…", + fdinfo_path.as_ref().to_string_lossy() + ); let content = std::fs::read_to_string(fdinfo_path.as_ref())?; let pci_slot = RE_DRM_PDEV @@ -569,6 +629,7 @@ impl ProcessData { .unwrap_or_default(); if !NPU_DRIVER_NAMES.contains(&driver) { + trace!("Driver '{driver}' is not known to be NPU-related, skipping"); bail!("this is not an NPU") } @@ -590,10 +651,16 @@ impl ProcessData { mem: total_memory, }; + trace!("Success reading GPU data for {pci_slot}: {stats:?}"); + return Ok((pci_slot, stats)); } fn read_gpu_fdinfo>(fdinfo_path: P) -> Result<(PciSlot, GpuUsageStats)> { + trace!( + "Reading and parsing {} for GPU stats…", + fdinfo_path.as_ref().to_string_lossy() + ); let content = std::fs::read_to_string(fdinfo_path.as_ref())?; let pci_slot = RE_DRM_PDEV @@ -609,6 +676,7 @@ impl ProcessData { .unwrap_or_default(); if !GPU_DRIVER_NAMES.contains(&driver) { + trace!("Driver {driver} is not known to be GPU-related, skipping"); bail!("this is not a GPU"); } @@ -672,10 +740,14 @@ impl ProcessData { nvidia: false, }; + trace!("Success reading GPU data for {pci_slot}: {stats:?}"); + return Ok((pci_slot, stats)); } fn nvidia_gpu_stats_all(pid: i32) -> BTreeMap { + trace!("Gathering NVIDIA GPU stats…"); + let mut return_map = BTreeMap::new(); for (pci_slot, _) in NVML_DEVICES.iter() { @@ -688,6 +760,7 @@ impl ProcessData { } fn nvidia_gpu_stats(pid: i32, pci_slot: PciSlot) -> Result { + trace!("Gathering GPU stats for NVIDIA GPU at {pci_slot}…"); let this_process_stats = NVIDIA_PROCESSES_STATS .read() .unwrap() @@ -722,6 +795,7 @@ impl ProcessData { } fn nvidia_process_infos() -> HashMap> { + trace!("Refreshing NVIDIA process infos…"); let mut return_map = HashMap::new(); for (pci_slot, gpu) in NVML_DEVICES.iter() { @@ -735,6 +809,7 @@ impl ProcessData { } fn nvidia_process_stats() -> HashMap> { + trace!("Refreshing NVIDIA process stats…"); let mut return_map = HashMap::new(); for (pci_slot, gpu) in NVML_DEVICES.iter() { diff --git a/src/bin/resources-processes.rs b/src/bin/resources-processes.rs index 64569716..154d01ad 100644 --- a/src/bin/resources-processes.rs +++ b/src/bin/resources-processes.rs @@ -1,4 +1,5 @@ use anyhow::Result; +use log::{info, trace}; use process_data::ProcessData; use ron::ser::PrettyConfig; use std::io::{Read, Write}; @@ -18,6 +19,11 @@ struct Args { } fn main() -> Result<()> { + // Initialize logger + pretty_env_logger::init(); + + info!("Starting resources-processes…"); + let args = Args::parse(); if args.once { @@ -29,12 +35,14 @@ fn main() -> Result<()> { let mut buffer = [0; 1]; std::io::stdin().read_exact(&mut buffer)?; + trace!("Received character"); output(args.ron)?; } } fn output(ron: bool) -> Result<()> { + trace!("Gathering process data…"); let data = ProcessData::all_process_data()?; let encoded = if ron { @@ -50,10 +58,13 @@ fn output(ron: bool) -> Result<()> { let stdout = std::io::stdout(); let mut handle = stdout.lock(); + trace!("Sending content length ({})…", encoded.len()); handle.write_all(&len_byte_array)?; + trace!("Sending content…"); handle.write_all(&encoded)?; + trace!("Flushing…"); handle.flush()?; Ok(()) } diff --git a/src/utils/gpu/nvidia.rs b/src/utils/gpu/nvidia.rs index f1668f33..4f47cfd4 100644 --- a/src/utils/gpu/nvidia.rs +++ b/src/utils/gpu/nvidia.rs @@ -10,15 +10,9 @@ use process_data::pci_slot::PciSlot; use std::{path::PathBuf, sync::LazyLock}; static NVML: LazyLock> = LazyLock::new(|| { - let nvml = Nvml::init(); - - if let Err(error) = nvml.as_ref() { - warn!("Connection to NVML failed, reason: {error}"); - } else { - debug!("Successfully connected to NVML"); - } - - nvml + Nvml::init() + .inspect_err(|err| warn!("Unable to connect to NVML: {err}")) + .inspect(|_| debug!("Successfully connected to NVML")) }); use crate::utils::pci::Device; diff --git a/src/utils/process.rs b/src/utils/process.rs index 0680f715..bf228d85 100644 --- a/src/utils/process.rs +++ b/src/utils/process.rs @@ -35,18 +35,29 @@ static OTHER_PROCESS: LazyLock> = LazyLock::new let child = if *IS_FLATPAK { debug!("Spawning resources-processes in Flatpak mode ({proxy_path})"); Command::new(FLATPAK_SPAWN) - .args(["--host", proxy_path.as_str()]) + .args([ + &format!( + "--env=RUST_LOG={}", + std::env::var("RUST_LOG=resources").unwrap_or("warn".into()) + ), + "--host", + proxy_path.as_str(), + ]) .stdin(Stdio::piped()) .stdout(Stdio::piped()) - .stderr(Stdio::null()) + .stderr(Stdio::inherit()) .spawn() .unwrap() } else { debug!("Spawning resources-processes in native mode ({proxy_path})"); Command::new(proxy_path) + .arg(&format!( + "--env=RUST_LOG={}", + std::env::var("RUST_LOG=resources").unwrap_or("warn".into()) + )) .stdin(Stdio::piped()) .stdout(Stdio::piped()) - .stderr(Stdio::null()) + .stderr(Stdio::inherit()) .spawn() .unwrap() };