Skip to content

Commit

Permalink
Add option to dump input json to stdout
Browse files Browse the repository at this point in the history
Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
  • Loading branch information
newsch committed Jan 24, 2024
1 parent 93f4118 commit 2982f90
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 42 deletions.
93 changes: 52 additions & 41 deletions src/get_articles.rs
Original file line number Diff line number Diff line change
@@ -1,25 +1,36 @@
use std::{
borrow::Cow,
fs::{self, File},
io::{stdin, BufRead, Write},
io::{stdin, stdout, BufRead, Write},
os::unix,
path::{Path, PathBuf},
};

use anyhow::{anyhow, bail, Context};

use om_wikiparser::{
html::{self, HtmlError},
parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file,
html, parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file,
wm::{Page, Title},
};

// Selects which input pages are echoed to stdout when the `dump_json` option is given.
//
// NOTE(review): plain `//` comments are used here deliberately. clap is understood to turn
// `///` doc comments on `ValueEnum` items into CLI help text, which would change program
// output -- confirm before converting these to doc comments.
#[derive(clap::ValueEnum, Copy, Clone)]
pub enum DumpFilter {
// Echo the input buffer (presumably the page's raw JSON line -- confirm in `run`) for every
// page that reaches the output step, before HTML processing, so it is emitted even if a
// later step fails.
Match,
// Echo the input buffer only for pages whose HTML processing returned an error.
Error,
// Not referenced by the dumping logic visible in `run`; reserved for panic dumping per the
// FIXME below.
Panic, // FIXME: move panic dumping to this
}

/// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
///
/// Expects an uncompressed dump (newline-delimited JSON) connected to stdin.
#[derive(clap::Args)]
pub struct Args {
/// Directory to write the extracted articles to.
pub output_dir: PathBuf,
#[arg(required_unless_present = "dump_json")]
pub output_dir: Option<PathBuf>,

#[arg(long)]
pub dump_json: Option<DumpFilter>,

/// Path to a TSV file that contains one or more of `wikidata`, `wikipedia` columns.
///
Expand Down Expand Up @@ -103,10 +114,14 @@ pub fn run(args: Args) -> anyhow::Result<()> {
.map(|p| File::options().create(true).append(true).open(p))
.transpose()?;

if !args.output_dir.is_dir() {
bail!("output dir {:?} does not exist", args.output_dir)
if let Some(output_dir) = &args.output_dir {
if !output_dir.is_dir() {
bail!("output dir {:?} does not exist", output_dir);
}
}

let mut stdout = stdout();

info!("Processing dump");
let mut dump = stdin().lock();

Expand Down Expand Up @@ -179,8 +194,31 @@ pub fn run(args: Args) -> anyhow::Result<()> {
}
}

if let Err(e) = write(&args.output_dir, &page, matching_titles, !args.no_simplify) {
error!("Error writing article: {:#}", e);
// Always write regardless of later errors.
if let Some(DumpFilter::Match) = args.dump_json {
stdout.write_all(buffer.as_bytes())?;
}

let article_output = if args.no_simplify {
Ok(Cow::Borrowed(&page.article_body.html))
} else {
html::process_str(&page.article_body.html, &page.in_language.identifier).map(Cow::Owned)
};

match article_output {
Err(e) => {
error!("Error processing article: {:#}", e);
if let Some(DumpFilter::Error) = args.dump_json {
stdout.write_all(buffer.as_bytes())?;
}
}
Ok(html) => {
if let Some(output_dir) = args.output_dir.as_ref() {
if let Err(e) = write(output_dir, &page, matching_titles, &html) {
error!("Error writing article: {:#}", e);
}
}
}
}
}

Expand Down Expand Up @@ -275,47 +313,20 @@ fn write(
base: impl AsRef<Path>,
page: &Page,
redirects: impl IntoIterator<Item = Title>,
simplify: bool,
html: &str,
) -> anyhow::Result<()> {
let html = if !simplify {
page.article_body.html.to_string()
} else {
match html::process_str(&page.article_body.html, &page.in_language.identifier) {
Ok(html) => html,
Err(HtmlError::Panic(msg)) => {
// Write original article text to disk
let mut error_file = base.as_ref().to_path_buf();
error_file.push("errors");
if !error_file.exists() {
fs::create_dir(&error_file).context("creating error directory")?;
}
error_file.push(page.name.replace('/', "%2F"));
error_file.set_extension("html");

fs::write(&error_file, &page.article_body.html).context("writing error file")?;

if !msg.is_empty() {
bail!("panic occurred while processing html (saved to {error_file:?}): {msg}");
} else {
bail!("panic occurred while processing html (saved to {error_file:?})");
}
}
Err(e) => bail!(e),
}
};

let article_dir = create_article_dir(&base, page, redirects)?;

// Write html to determined file.
let mut filename = article_dir;
filename.push(&page.in_language.identifier);
filename.set_extension("html");

debug!("{:?}: {:?}", page.name, filename);

if filename.exists() {
debug!("Overwriting existing file");
}
debug!(
file = filename.to_string_lossy().as_ref(),
exists = filename.exists(),
"Writing article"
);

let mut file =
File::create(&filename).with_context(|| format!("creating html file {:?}", filename))?;
Expand Down
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<T
.lines()
.enumerate()
.map(|(i, line)| {
Title::from_url(line).with_context(|| {
Title::from_osm_tag(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
Expand Down

0 comments on commit 2982f90

Please sign in to comment.