From d7728c095795b1b2060e398d090250f4a3534175 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 18 Apr 2024 09:22:26 +0200 Subject: [PATCH] feat: add "sort-and-run" subcommand wip --- Cargo.lock | 33 +- Cargo.toml | 1 + packages/nextclade-cli/Cargo.toml | 1 + packages/nextclade-cli/src/cli/mod.rs | 1 + .../nextclade-cli/src/cli/nextclade_cli.rs | 6 +- .../nextclade-cli/src/cli/nextclade_loop.rs | 1 + .../cli/nextclade_run_sort_and_analysis.rs | 514 ++++++++++++++++++ .../src/cli/nextclade_seq_sort.rs | 8 +- .../src/dataset/dataset_download.rs | 31 ++ packages/nextclade/src/utils/entry.rs | 49 ++ packages/nextclade/src/utils/mod.rs | 1 + 11 files changed, 622 insertions(+), 24 deletions(-) create mode 100644 packages/nextclade-cli/src/cli/nextclade_run_sort_and_analysis.rs create mode 100644 packages/nextclade/src/utils/entry.rs diff --git a/Cargo.lock b/Cargo.lock index 6b97ede4a..6786b3455 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -700,15 +700,11 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.15" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset", - "scopeguard", ] [[package]] @@ -722,14 +718,21 @@ dependencies = [ ] [[package]] -name = "crossbeam-utils" -version = "0.8.16" +name = "crossbeam-skiplist" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" +checksum = "df29de440c58ca2cc6e587ec3d22347551a32435fbde9d2bff64e78a9ffa151b" dependencies = [ - "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" + [[package]] name = "crossterm" version = "0.26.1" @@ -1613,15 +1616,6 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" -[[package]] -name = "memoffset" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" -dependencies = [ - "autocfg", -] - [[package]] name = "mime" version = "0.3.17" @@ -1786,6 +1780,7 @@ dependencies = [ "criterion", "crossbeam", "crossbeam-channel", + "crossbeam-skiplist", "ctor 0.2.2", "dotenv", "dotenv_codegen", diff --git a/Cargo.toml b/Cargo.toml index 7d895943c..117214ff2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ console_error_panic_hook = "=0.1.7" criterion = { version = "=0.5.1", features = ["html_reports"] } crossbeam = "=0.8.2" crossbeam-channel = "=0.5.8" +crossbeam-skiplist = "=0.1.3" csv = "=1.2.2" ctor = "=0.2.2" derive_more = "=0.99.17" diff --git a/packages/nextclade-cli/Cargo.toml b/packages/nextclade-cli/Cargo.toml index 6072a8469..6cb142625 100644 --- a/packages/nextclade-cli/Cargo.toml +++ b/packages/nextclade-cli/Cargo.toml @@ -25,6 +25,7 @@ color-eyre = { workspace = true } comfy-table = { workspace = true } crossbeam = { workspace = true } crossbeam-channel = { workspace = true } +crossbeam-skiplist = { workspace = true } ctor = { workspace = true } dotenv = { workspace = true } dotenv_codegen = { workspace = true } diff --git a/packages/nextclade-cli/src/cli/mod.rs b/packages/nextclade-cli/src/cli/mod.rs index 27e87afc3..7e69afcff 100644 --- a/packages/nextclade-cli/src/cli/mod.rs +++ b/packages/nextclade-cli/src/cli/mod.rs @@ -4,6 +4,7 @@ pub mod nextclade_dataset_list; pub mod nextclade_loop; pub mod nextclade_ordered_writer; pub mod nextclade_read_annotation; +pub mod nextclade_run_sort_and_analysis; pub mod 
nextclade_seq_sort; pub mod print_help_markdown; pub mod verbosity; diff --git a/packages/nextclade-cli/src/cli/nextclade_cli.rs b/packages/nextclade-cli/src/cli/nextclade_cli.rs index 4788a5acd..bf685eaf5 100644 --- a/packages/nextclade-cli/src/cli/nextclade_cli.rs +++ b/packages/nextclade-cli/src/cli/nextclade_cli.rs @@ -2,6 +2,7 @@ use crate::cli::nextclade_dataset_get::nextclade_dataset_get; use crate::cli::nextclade_dataset_list::nextclade_dataset_list; use crate::cli::nextclade_loop::nextclade_run; use crate::cli::nextclade_read_annotation::nextclade_read_annotation; +use crate::cli::nextclade_run_sort_and_analysis::nextclade_run_sort_and_analysis; use crate::cli::nextclade_seq_sort::nextclade_seq_sort; use crate::cli::print_help_markdown::print_help_markdown; use crate::cli::verbosity::Verbosity; @@ -95,6 +96,8 @@ pub enum NextcladeCommands { /// For short help type: `nextclade -h`, for extended help type: `nextclade --help`. Each subcommand has its own help, for example: `nextclade sort --help`. Sort(Box), + SortAndRun(Box), + /// Read genome annotation and present it in Nextclade's internal formats. This is mostly only useful for Nextclade maintainers and the most curious users. Note that these internal formats have no stability guarantees and can be changed at any time without notice. /// /// For short help type: `nextclade -h`, for extended help type: `nextclade --help`. Each subcommand has its own help, for example: `nextclade sort --help`. @@ -442,7 +445,7 @@ pub struct NextcladeRunInputArgs { } #[allow(clippy::struct_excessive_bools)] -#[derive(Parser, Debug, Clone)] +#[derive(Parser, Debug, Clone, Default)] pub struct NextcladeRunOutputArgs { /// REMOVED. 
Use `--output-all` instead #[clap(long)] @@ -1128,6 +1131,7 @@ pub fn nextclade_parse_cli_args() -> Result<(), Report> { } }, NextcladeCommands::Sort(seq_sort_args) => nextclade_seq_sort(&seq_sort_args), + NextcladeCommands::SortAndRun(args) => nextclade_run_sort_and_analysis(&args), NextcladeCommands::ReadAnnotation(read_annotation_args) => nextclade_read_annotation(&read_annotation_args), } } diff --git a/packages/nextclade-cli/src/cli/nextclade_loop.rs b/packages/nextclade-cli/src/cli/nextclade_loop.rs index 1eaf4d54c..e42e7df9d 100644 --- a/packages/nextclade-cli/src/cli/nextclade_loop.rs +++ b/packages/nextclade-cli/src/cli/nextclade_loop.rs @@ -17,6 +17,7 @@ use nextclade::tree::tree_builder::graph_attach_new_nodes_in_place; use nextclade::types::outputs::NextcladeOutputs; use nextclade::utils::option::OptionMapRefFallible; +#[derive(Debug)] pub struct NextcladeRecord { pub index: usize, pub seq_name: String, diff --git a/packages/nextclade-cli/src/cli/nextclade_run_sort_and_analysis.rs b/packages/nextclade-cli/src/cli/nextclade_run_sort_and_analysis.rs new file mode 100644 index 000000000..3b67e611e --- /dev/null +++ b/packages/nextclade-cli/src/cli/nextclade_run_sort_and_analysis.rs @@ -0,0 +1,514 @@ +use crate::cli::nextclade_cli::{ + NextcladeOutputSelection, NextcladeRunOtherParams, NextcladeRunOutputArgs, NextcladeSortArgs, +}; +use crate::cli::nextclade_loop::NextcladeRecord; +use crate::cli::nextclade_ordered_writer::NextcladeOrderedWriter; +use crate::cli::nextclade_seq_sort::{get_all_prefix_names, StatsPrinter}; +use crate::dataset::dataset_download::{dataset_download_by_name, download_datasets_index_json}; +use crate::io::http_client::{HttpClient, ProxyConfig}; +use crossbeam_channel::{Receiver, Sender}; +use crossbeam_skiplist::SkipMap; +use eyre::{ContextCompat, Report, WrapErr}; +use itertools::Itertools; +use log::LevelFilter; +use nextclade::io::csv::CsvStructFileWriter; +use nextclade::io::fasta::{FastaReader, FastaRecord, FastaWriter}; +use 
nextclade::io::fs::add_extension; +use nextclade::io::nextclade_csv::CsvColumnConfig; +use nextclade::make_error; +use nextclade::run::nextclade_wasm::{AnalysisInitialData, AnalysisOutput, Nextclade}; +use nextclade::run::params::NextcladeInputParamsOptional; +use nextclade::sort::minimizer_index::{MinimizerIndexJson, MINIMIZER_INDEX_ALGO_VERSION}; +use nextclade::sort::minimizer_search::{run_minimizer_search, MinimizerSearchResult}; +use nextclade::sort::params::NextcladeSeqSortParams; +use nextclade::utils::entry::MapEntryFallible; +use nextclade::utils::option::{OptionMapMutFallible, OptionMapRefFallible}; +use schemars::JsonSchema; +use serde::Serialize; +use std::collections::btree_map::Entry::{Occupied, Vacant}; +use std::collections::BTreeMap; +use std::path::{Path, PathBuf}; +use std::str::FromStr; +use strum::IntoEnumIterator; +use tinytemplate::TinyTemplate; +use url::Url; + +pub fn nextclade_run_sort_and_analysis(args: &NextcladeSortArgs) -> Result<(), Report> { + // info!("Command-line arguments:\n{run_args:#?}"); + + let NextcladeSortArgs { + input_minimizer_index_json, + server, + proxy_config, + .. + } = args; + + // let NextcladeRunArgs { + // inputs: NextcladeRunInputArgs { + // input_fastas, + // cds_selection: cdses, + // .. + // }, + // outputs: + // NextcladeRunOutputArgs { + // output_columns_selection, + // output_graph, + // output_tree, + // output_tree_nwk, + // .. 
+ // }, + // params, + // other_params: NextcladeRunOtherParams { jobs }, + // } = run_args.clone(); + + let verbose = log::max_level() >= LevelFilter::Info; + + let minimizer_index = if let Some(input_minimizer_index_json) = &input_minimizer_index_json { + // If a file is provided, use data from it + MinimizerIndexJson::from_path(input_minimizer_index_json) + } else { + // Otherwise fetch from dataset server + let http = HttpClient::new(server, proxy_config, verbose)?; + let index = download_datasets_index_json(&http)?; + let minimizer_index_path = index + .minimizer_index + .iter() + .find(|minimizer_index| MINIMIZER_INDEX_ALGO_VERSION == minimizer_index.version) + .map(|minimizer_index| &minimizer_index.path); + + if let Some(minimizer_index_path) = minimizer_index_path { + let minimizer_index_str = http.get(minimizer_index_path)?; + MinimizerIndexJson::from_str(String::from_utf8(minimizer_index_str)?) + } else { + let server_versions = index + .minimizer_index + .iter() + .map(|minimizer_index| format!("'{}'", minimizer_index.version)) + .join(","); + let server_versions = if server_versions.is_empty() { + "none available".to_owned() + } else { + format!(": {server_versions}") + }; + + make_error!("No compatible reference minimizer index data is found for this dataset server. Cannot proceed. \n\nThis version of Nextclade supports index versions up to '{}', but the server has {}.\n\nTry to upgrade Nextclade to the latest version and/or contact dataset server maintainers.", MINIMIZER_INDEX_ALGO_VERSION, server_versions) + } + }?; + + run(args, &minimizer_index, verbose) +} + +pub fn run(args: &NextcladeSortArgs, minimizer_index: &MinimizerIndexJson, verbose: bool) -> Result<(), Report> { + let NextcladeSortArgs { + input_fastas, + search_params, + other_params: NextcladeRunOtherParams { jobs }, + server, + output_dir, + // output_columns_selection, + // output_selection, + .. 
+ } = args; + + let csv_column_config = CsvColumnConfig::new(&[])?; + let output_basename = "nextclade".to_owned(); + let output_selection = &NextcladeOutputSelection::iter().collect_vec(); + let should_write_tree = [NextcladeOutputSelection::Tree, NextcladeOutputSelection::TreeNwk] + .iter() + .any(|x| output_selection.contains(x)); + + let nextclades = &SkipMap::new(); + + std::thread::scope(|s| { + const CHANNEL_SIZE: usize = 128; + let (fasta_sender, fasta_receiver) = crossbeam_channel::bounded::(CHANNEL_SIZE); + let (result_sender, result_receiver) = crossbeam_channel::bounded::(CHANNEL_SIZE); + + s.spawn(|| { + let mut reader = FastaReader::from_paths(input_fastas).unwrap(); + loop { + let mut record = FastaRecord::default(); + reader.read(&mut record).unwrap(); + if record.is_empty() { + break; + } + fasta_sender + .send(record) + .wrap_err("When sending a FastaRecord") + .unwrap(); + } + drop(fasta_sender); + }); + + for _ in 0..*jobs { + let fasta_receiver = fasta_receiver.clone(); + let result_sender = result_sender.clone(); + + s.spawn(move || { + worker_thread( + minimizer_index, + verbose, + search_params, + server, + &fasta_receiver, + &result_sender, + nextclades, + ) + .unwrap(); + + drop(result_sender); + }); + } + + s.spawn(move || { + let mut sort_writer = SortWriter::new(args, verbose).unwrap(); + let mut output_writers = BTreeMap::new(); + let mut outputs = BTreeMap::new(); + + for record in result_receiver { + let WorkerResultRecord { + dataset_name, + fasta_record, + sort_result, + analysis_result, + } = record; + + sort_writer.write(sort_result, &fasta_record).unwrap(); + + let nextclade = nextclades.get(&dataset_name).unwrap(); + let nextclade = nextclade.value(); + + let output_writer = output_writers + .entry(dataset_name.clone()) + .or_insert_with_key_fallible(|dataset_name| -> Result { + let AnalysisInitialData { + clade_node_attr_key_descs, + phenotype_attr_descs, + aa_motif_keys, + .. 
+ } = nextclade.get_initial_data(); + + let default_output_file_path = output_dir.clone().unwrap().join(dataset_name).join(&output_basename); + + let output_fasta = output_selection + .contains(&NextcladeOutputSelection::Fasta) + .then(|| add_extension(&default_output_file_path, "aligned.fasta")); + + let output_translations = output_selection + .contains(&NextcladeOutputSelection::Translations) + .then(|| -> Result { + let output_translations_path = + default_output_file_path.with_file_name(format!("{output_basename}.cds_translation.{{cds}}")); + let output_translations_path = add_extension(output_translations_path, "fasta"); + + let output_translations = output_translations_path + .to_str() + .wrap_err_with(|| format!("When converting path to string: '{output_translations_path:?}'"))? + .to_owned(); + + Ok(output_translations) + }) + .transpose()?; + + let output_ndjson = output_selection + .contains(&NextcladeOutputSelection::Ndjson) + .then(|| add_extension(&default_output_file_path, "ndjson")); + + let output_json = output_selection + .contains(&NextcladeOutputSelection::Json) + .then(|| add_extension(&default_output_file_path, "json")); + + let output_csv = output_selection + .contains(&NextcladeOutputSelection::Csv) + .then(|| add_extension(&default_output_file_path, "csv")); + + let output_tsv = output_selection + .contains(&NextcladeOutputSelection::Tsv) + .then(|| add_extension(&default_output_file_path, "tsv")); + + let mut output_writer = NextcladeOrderedWriter::new( + &nextclade.gene_map, + clade_node_attr_key_descs, + phenotype_attr_descs, + aa_motif_keys, + &csv_column_config, + &NextcladeRunOutputArgs { + output_dir: output_dir.to_owned(), + output_fasta, + output_translations, + output_ndjson, + output_json, + output_csv, + output_tsv, + ..NextcladeRunOutputArgs::default() + }, + &nextclade.params, + ) + .wrap_err("When creating output writer")?; + + if nextclade.params.general.include_reference { + output_writer + .write_ref(&nextclade.ref_record, 
&nextclade.ref_translation) + .wrap_err("When writing output record for ref sequence")?; + } + Ok(output_writer) + }) + .unwrap(); + + if should_write_tree { + // Save analysis results if they will be needed later + if let Ok(AnalysisOutput { analysis_result, .. }) = &analysis_result.outputs_or_err { + outputs + .entry(dataset_name.clone()) + .or_insert_with(|| vec![analysis_result.clone()]) + .push(analysis_result.clone()); + } + } + + output_writer + .write_record(analysis_result) + .wrap_err("When writing output record") + .unwrap(); + } + + // // for (dataset_name, nextclade) in nextclades { + // for entry in nextclades { + // // let writer = output_writers.remove(&dataset_name).unwrap(); + // let outputs = outputs.remove(&dataset_name).unwrap(); + // + // let Nextclade { + // ref_seq, params, graph, .. + // } = nextclade; + // + // if let Some(mut graph) = graph { + // graph_attach_new_nodes_in_place(&mut graph, outputs, ref_seq.len(), ¶ms.tree_builder).unwrap(); + // + // let default_output_file_path = output_dir.clone().unwrap().join(dataset_name).join(&output_basename); + // + // if output_selection.contains(&NextcladeOutputSelection::Tree) { + // let tree = convert_graph_to_auspice_tree(&graph).unwrap(); + // let output_tree = add_extension(&default_output_file_path, "auspice.json"); + // json_write(output_tree, &tree, JsonPretty(true)).unwrap(); + // } + // + // if output_selection.contains(&NextcladeOutputSelection::TreeNwk) { + // let output_tree_nwk = add_extension(&default_output_file_path, "nwk"); + // nwk_write_to_file(output_tree_nwk, &graph).unwrap(); + // } + // } + // } + }); + }); + + Ok(()) +} + +#[derive(Debug)] +pub struct WorkerResultRecord { + pub fasta_record: FastaRecord, + pub sort_result: MinimizerSearchResult, + pub analysis_result: NextcladeRecord, + pub dataset_name: String, +} + +fn worker_thread( + minimizer_index: &MinimizerIndexJson, + verbose: bool, + search_params: &NextcladeSeqSortParams, + server: &Url, + fasta_receiver: 
&Receiver, + result_sender: &Sender, + nextclades: &SkipMap, +) -> Result<(), Report> { + for fasta_record in fasta_receiver { + // info!("Processing sequence '{}'", fasta_record.seq_name); + + let sort_result = run_minimizer_search(&fasta_record, minimizer_index, search_params).wrap_err_with(|| { + format!( + "When processing sequence #{} '{}'", + fasta_record.index, fasta_record.seq_name + ) + })?; + + let http = HttpClient::new(server, &ProxyConfig::default(), verbose)?; + if let Some(dataset) = sort_result.datasets.first() { + let nextclade_kv = nextclades.get_or_insert_with(dataset.name.clone(), || { + let inputs = dataset_download_by_name(&http, &dataset.name).unwrap(); + Nextclade::new(inputs, vec![], &NextcladeInputParamsOptional::default()).unwrap() + }); + let nextclade = nextclade_kv.value(); + + let outputs_or_err = nextclade.run(&fasta_record).wrap_err_with(|| { + format!( + "When processing sequence #{} '{}'", + fasta_record.index, fasta_record.seq_name + ) + }); + + let analysis_result = NextcladeRecord { + index: fasta_record.index, + seq_name: fasta_record.seq_name.clone(), + outputs_or_err, + }; + + result_sender + .send(WorkerResultRecord { + dataset_name: dataset.name.clone(), + fasta_record, + sort_result, + analysis_result, + }) + .wrap_err("When sending minimizer record into the channel")?; + } else { + // TODO(sort-and-run): decide what to do if dataset is not detected + } + } + + Ok(()) +} + +pub struct SortWriter<'a> { + all_matches: bool, + writers: BTreeMap, + stats: StatsPrinter, + output_dir: Option, + template: Option>, + results_csv: Option, +} + +impl Drop for SortWriter<'_> { + fn drop(&mut self) { + self.stats.finish(); + } +} + +impl<'a> SortWriter<'a> { + pub fn new(args: &'a NextcladeSortArgs, verbose: bool) -> Result { + let NextcladeSortArgs { + output_dir, + output_path, + output_results_tsv, + search_params, + .. 
+ } = args; + + let template = output_path.map_ref_fallible(move |output_path| -> Result { + let mut template = TinyTemplate::new(); + template + .add_template("output", output_path) + .wrap_err_with(|| format!("When parsing template: '{output_path}'"))?; + Ok(template) + })?; + + let stats = StatsPrinter::new(verbose); + + let results_csv = + output_results_tsv.map_ref_fallible(|output_results_tsv| CsvStructFileWriter::new(output_results_tsv, b'\t'))?; + + Ok(Self { + writers: BTreeMap::new(), + all_matches: search_params.all_matches, + output_dir: output_dir.clone(), + template, + stats, + results_csv, + }) + } + + pub fn write(&mut self, result: MinimizerSearchResult, fasta_record: &FastaRecord) -> Result<(), Report> { + let datasets = &{ + if self.all_matches { + result.datasets + } else { + result.datasets.into_iter().take(1).collect_vec() + } + }; + + self.stats.print_seq(datasets, &fasta_record.seq_name); + + if datasets.is_empty() { + self.results_csv.map_mut_fallible(|results_csv| { + results_csv.write(&SeqSortCsvEntry { + index: fasta_record.index, + seq_name: &fasta_record.seq_name, + dataset: None, + score: None, + num_hits: None, + }) + })?; + } + + for dataset in datasets { + self.results_csv.map_mut_fallible(|results_csv| { + results_csv.write(&SeqSortCsvEntry { + index: fasta_record.index, + seq_name: &fasta_record.seq_name, + dataset: Some(&dataset.name), + score: Some(dataset.score), + num_hits: Some(dataset.n_hits), + }) + })?; + } + + let names = datasets + .iter() + .map(|dataset| get_all_prefix_names(&dataset.name)) + .collect::>, Report>>()? 
+ .into_iter() + .flatten() + .unique(); + + for name in names { + let filepath = Self::get_filepath(&name, &self.template, &self.output_dir)?; + + if let Some(filepath) = filepath { + let writer = Self::get_or_insert_writer(&mut self.writers, filepath)?; + writer.write(&fasta_record.seq_name, &fasta_record.seq, false)?; + } + } + + Ok(()) + } + + fn get_or_insert_writer( + writers: &mut BTreeMap, + filepath: impl AsRef, + ) -> Result<&mut FastaWriter, Report> { + Ok(match writers.entry(filepath.as_ref().to_owned()) { + Occupied(e) => e.into_mut(), + Vacant(e) => e.insert(FastaWriter::from_path(filepath)?), + }) + } + + fn get_filepath( + name: &str, + tt: &Option, + output_dir: &Option, + ) -> Result, Report> { + Ok(match (&tt, output_dir) { + (Some(tt), None) => { + let filepath_str = tt + .render("output", &OutputTemplateContext { name }) + .wrap_err("When rendering output path template")?; + Some(PathBuf::from_str(&filepath_str).wrap_err_with(|| format!("Invalid output path: '{filepath_str}'"))?) 
+ } + (None, Some(output_dir)) => Some(output_dir.join(name).join("sequences.fasta")), + _ => None, + }) + } +} + +#[derive(Clone, Default, Debug, Serialize, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct SeqSortCsvEntry<'a> { + pub index: usize, + pub seq_name: &'a str, + pub dataset: Option<&'a str>, + pub score: Option, + pub num_hits: Option, +} + +#[derive(Serialize)] +struct OutputTemplateContext<'a> { + name: &'a str, +} diff --git a/packages/nextclade-cli/src/cli/nextclade_seq_sort.rs b/packages/nextclade-cli/src/cli/nextclade_seq_sort.rs index 0a29cffe0..e98881a24 100644 --- a/packages/nextclade-cli/src/cli/nextclade_seq_sort.rs +++ b/packages/nextclade-cli/src/cli/nextclade_seq_sort.rs @@ -243,14 +243,14 @@ pub fn get_all_prefix_names(name: impl AsRef) -> Result, Report .collect() } -struct StatsPrinter { +pub struct StatsPrinter { enabled: bool, stats: BTreeMap, n_undetected: usize, } impl StatsPrinter { - fn new(enabled: bool) -> Self { + pub fn new(enabled: bool) -> Self { if enabled { println!("Suggested datasets for each sequence"); println!("{}┐", "─".repeat(110)); @@ -268,7 +268,7 @@ impl StatsPrinter { } } - fn print_seq(&mut self, datasets: &[MinimizerSearchDatasetResult], seq_name: &str) { + pub fn print_seq(&mut self, datasets: &[MinimizerSearchDatasetResult], seq_name: &str) { if !self.enabled { return; } @@ -304,7 +304,7 @@ impl StatsPrinter { println!("{}┤", "─".repeat(110)); } - fn finish(&self) { + pub fn finish(&self) { if !self.enabled { return; } diff --git a/packages/nextclade-cli/src/dataset/dataset_download.rs b/packages/nextclade-cli/src/dataset/dataset_download.rs index 6bf321692..714be7246 100644 --- a/packages/nextclade-cli/src/dataset/dataset_download.rs +++ b/packages/nextclade-cli/src/dataset/dataset_download.rs @@ -353,3 +353,34 @@ pub fn dataset_str_download_and_load( virus_properties, }) } + +pub fn dataset_download_by_name(http: &HttpClient, dataset_name: impl AsRef) -> Result { + let dataset = 
dataset_http_get(&http, dataset_name.as_ref(), &None)?; + + let virus_properties: VirusProperties = + VirusProperties::from_str(&dataset_file_http_get(&http, &dataset, &dataset.files.pathogen_json)?)?; + + let ref_record = read_one_fasta_str(dataset_file_http_get(&http, &dataset, &dataset.files.reference)?)?; + + let gene_map = dataset + .files + .genome_annotation + .as_ref() + .map_ref_fallible(|genome_annotation| { + let s = dataset_file_http_get(&http, &dataset, genome_annotation)?; + GeneMap::from_str(s) + })? + .unwrap_or_default(); + + let tree = dataset.files.tree_json.as_ref().map_ref_fallible(|tree_json| { + let s = dataset_file_http_get(&http, &dataset, tree_json)?; + AuspiceTree::from_str(s) + })?; + + Ok(NextcladeParams { + ref_record, + gene_map, + tree, + virus_properties, + }) +} diff --git a/packages/nextclade/src/utils/entry.rs b/packages/nextclade/src/utils/entry.rs new file mode 100644 index 000000000..e8b83741e --- /dev/null +++ b/packages/nextclade/src/utils/entry.rs @@ -0,0 +1,49 @@ +use std::collections::{btree_map, hash_map}; + +pub trait MapEntryFallible<'a, K, V, E> { + fn or_insert_with_fallible Result>(self, default: F) -> Result<&'a mut V, E>; + + fn or_insert_with_key_fallible Result>(self, default: F) -> Result<&'a mut V, E>; +} + +impl<'a, K: Ord, V, E> MapEntryFallible<'a, K, V, E> for btree_map::Entry<'a, K, V> { + #[inline] + fn or_insert_with_fallible Result>(self, default: F) -> Result<&'a mut V, E> { + Ok(match self { + btree_map::Entry::Occupied(entry) => entry.into_mut(), + btree_map::Entry::Vacant(entry) => entry.insert(default()?), + }) + } + + #[inline] + fn or_insert_with_key_fallible Result>(self, default: F) -> Result<&'a mut V, E> { + Ok(match self { + btree_map::Entry::Occupied(entry) => entry.into_mut(), + btree_map::Entry::Vacant(entry) => { + let value = default(entry.key())?; + entry.insert(value) + } + }) + } +} + +impl<'a, K: Ord, V, E> MapEntryFallible<'a, K, V, E> for hash_map::Entry<'a, K, V> { + #[inline] + 
fn or_insert_with_fallible Result>(self, default: F) -> Result<&'a mut V, E> { + Ok(match self { + hash_map::Entry::Occupied(entry) => entry.into_mut(), + hash_map::Entry::Vacant(entry) => entry.insert(default()?), + }) + } + + #[inline] + fn or_insert_with_key_fallible Result>(self, default: F) -> Result<&'a mut V, E> { + Ok(match self { + hash_map::Entry::Occupied(entry) => entry.into_mut(), + hash_map::Entry::Vacant(entry) => { + let value = default(entry.key())?; + entry.insert(value) + } + }) + } +} diff --git a/packages/nextclade/src/utils/mod.rs b/packages/nextclade/src/utils/mod.rs index a2f3e8517..7cfafb47d 100644 --- a/packages/nextclade/src/utils/mod.rs +++ b/packages/nextclade/src/utils/mod.rs @@ -2,6 +2,7 @@ pub mod any; pub mod boolean; pub mod collections; pub mod datetime; +pub mod entry; pub mod error; pub mod getenv; pub mod global_init;