Skip to content

Commit

Permalink
make JSON-to-CSV and RemesPath to_csv use the same algorithm
Browse files Browse the repository at this point in the history
- ensure that delimiters inside non-strings are escaped in JSON-to-CSV
- ensure that null is represented as empty string in RemesPath to_csv
- add more tests for quoted numbers in CSV
  • Loading branch information
molsonkiko committed Dec 7, 2023
1 parent 26cbd4c commit 9328eb5
Show file tree
Hide file tree
Showing 13 changed files with 171 additions and 145 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
- bug with calling arg functions on projections - seems like object projections are treated as arrays when calling arg functions on them in some cases?
- issue with treeview closing when a file with a treeview is moved from one view to another
- `loop()` function used in `s_sub` callbacks is not thread-safe. This doesn't matter right now because RemesPath is single-threaded, but it could matter in the future.
- __Known issues with `Select This` and `Select all children` commands in tree view (FIX THESE ASAP)__:
* Empty strings are ignored by `Select this`
* Floating point numbers may not be selected correctly (the JSON string representation of a number may differ in length from that number's representation in the CSV)

## [6.0.0] - (UNRELEASED) 2023-MM-DD

Expand Down
2 changes: 1 addition & 1 deletion JsonToolsNppPlugin/Forms/JsonToCsvForm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ Stringify iterables
Dictionary<string, object> schema = JsonSchemaMaker.BuildSchema(json);
JNode tab = tabularizer.BuildTable(json, schema, keysep);
string eol = Npp.GetEndOfLineString(eolComboBox.SelectedIndex);
csv = tabularizer.TableToCsv((JArray)tab, delim, '"', null, BoolsToIntsCheckBox.Checked, eol);
csv = tabularizer.TableToCsv((JArray)tab, delim, '"', eol, null, BoolsToIntsCheckBox.Checked);
}
catch (Exception ex)
{
Expand Down
44 changes: 41 additions & 3 deletions JsonToolsNppPlugin/Forms/TreeViewer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1003,6 +1003,44 @@ public string KeyOfTreeNode(TreeNode node, KeyStyle style)
return JNode.FormatKey(node.Name, style);
}

/// <summary>
/// Estimate of how many UTF8 bytes the text of a JNode occupies in a document parsed in regex mode.<br></br>
/// When delim is '\x00' (not in CSV mode), the estimate is simply the UTF8 byte count of the node's string form.<br></br>
/// In CSV mode, the estimate also accounts for wrapping quotes and doubled-up quote characters.
/// </summary>
/// <param name="jnode">the node whose document length is being estimated</param>
/// <param name="startpos">position in the document where the node's text begins (passed to Npp.editor.GetCharAt)</param>
/// <param name="delim">CSV delimiter, or '\x00' if the document is not being treated as CSV</param>
/// <param name="quote">CSV quote character (assumed to be ASCII)</param>
/// <returns>the estimated number of bytes the node's text occupies in the document</returns>
public static int LengthOfStringInRegexMode(JNode jnode, int startpos, char delim, char quote)
{
    string text = jnode.ValueOrToString();
    int byteLen = Encoding.UTF8.GetByteCount(text);
    if (delim == '\x00')
    {
        // not a CSV document, so the text is assumed to appear verbatim in the document
        return byteLen;
    }
    int nQuotes = 0;
    bool hasDelimOrNewline = false;
    foreach (char ch in text)
    {
        if (ch == delim || ch == '\r' || ch == '\n')
        {
            hasDelimOrNewline = true;
        }
        else if (ch == quote)
        {
            nQuotes++;
        }
    }
    if (hasDelimOrNewline || nQuotes > 0)
    {
        // the value had to be quoted: 2 bytes for the wrapping quotes,
        // plus 1 extra byte per literal quote (each one is doubled up inside a quoted value)
        return byteLen + nQuotes + 2;
    }
    // A value with no delimiters, quotes, or newlines does not *need* quotes,
    // but the document could still have wrapped it in quotes (which is equivalent),
    // so look at what is actually at the start position.
    // This comparison assumes the quote character is ASCII.
    int firstByte = Npp.editor.GetCharAt(startpos);
    if (firstByte == quote)
    {
        return byteLen + 2;
    }
    return byteLen;
}

public void SelectTreeNodeJson(TreeNode node)
{
if (Main.activeFname != fname)
Expand All @@ -1012,7 +1050,7 @@ public void SelectTreeNodeJson(TreeNode node)
{
nodeStartPos = NodePosInJsonDoc(node);
if (GetDocumentType() == DocumentType.REGEX)
nodeEndPos = nodeStartPos + JsonParser.UTF8BytesInCSVRepr(jnode.ValueOrToString(), csvDelim, csvQuote);
nodeEndPos = nodeStartPos + LengthOfStringInRegexMode(jnode, nodeStartPos, csvDelim, csvQuote);
else
nodeEndPos = Main.EndOfJNodeAtPos(nodeStartPos, Npp.editor.GetLength());
}
Expand Down Expand Up @@ -1050,8 +1088,8 @@ public void SelectTreeNodeJsonChildren(TreeNode node)
break;
}
string childstr = child.ValueOrToString();
int utf8Len = JsonParser.UTF8BytesInCSVRepr(childstr, csvDelim, csvQuote);
int startPos = child.position;
int startPos = child.position + selectionStartPos;
int utf8Len = LengthOfStringInRegexMode(child, startPos, csvDelim, csvQuote);
int endPos = startPos + utf8Len;
if (endPos > startPos)
{
Expand Down
30 changes: 0 additions & 30 deletions JsonToolsNppPlugin/JSONTools/JsonParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -325,36 +325,6 @@ public static int ExtraUTF8BytesBetween(string inp, int start, int end)
return count;
}

/// <summary>
/// number of bytes in the UTF8-encoded CSV representation of string s (with delimiter delim and quote character quote)<br></br>
/// e.g. "fö\n'" would be represented as 'fö\n''' in a CSV file with '\'' as quote character, so UTF8BytesInCSVRepr("fö\n'", ',', '\'') would return 8.<br></br>
/// if delim is '\x00', return the UTF8 bytecount of s<br></br>
/// Every character of s, including a non-ASCII delimiter or quote character, is counted at its real UTF8 width.
/// </summary>
/// <param name="s">the unquoted, unescaped value of the CSV cell</param>
/// <param name="delim">CSV delimiter (or '\x00' for non-CSV file)</param>
/// <param name="quote">CSV quote character</param>
/// <returns>the UTF8 byte count of s as it would be written to the CSV file</returns>
public static int UTF8BytesInCSVRepr(string s, char delim, char quote)
{
    // measure the raw text with the real encoder, so that delimiter and quote characters
    // inside s are counted at their true width rather than assumed to be one byte each
    int byteCount = Encoding.UTF8.GetByteCount(s);
    if (delim == 0)
        return byteCount;
    int quoteCount = 0;
    bool delimOrNewline = false;
    for (int ii = 0; ii < s.Length; ii++)
    {
        char c = s[ii];
        if (c == '\r' || c == '\n' || c == delim)
            delimOrNewline = true;
        else if (c == quote)
            quoteCount++;
    }
    if (delimOrNewline || quoteCount > 0)
    {
        // strings containing newlines, quotes or delimiters must be wrapped in quotes (2 extra quote chars),
        // and quote chars in quoted strings must be doubled up (1 extra quote char per literal quote).
        // Each of those extra quote chars costs as many bytes as the quote character's UTF8 encoding.
        int quoteBytes = Encoding.UTF8.GetByteCount(new char[] { quote });
        byteCount += (2 + quoteCount) * quoteBytes;
    }
    return byteCount;
}

/// <summary>
/// Set the parser's state to severity, unless the state was already higher.<br></br>
/// If the severity is above the parser's loggerLevel:<br></br>
Expand Down
84 changes: 43 additions & 41 deletions JsonToolsNppPlugin/JSONTools/JsonTabularize.cs
Original file line number Diff line number Diff line change
Expand Up @@ -802,41 +802,69 @@ public JArray BuildTable(JNode obj, Dictionary<string, object> schema, string ke
/// </summary>
/// <param name="s"></param>
/// <param name="delim"></param>
/// <param name="quote_char"></param>
/// <param name="quote"></param>
/// <returns></returns>
public static void ApplyQuotesIfNeeded(StringBuilder sb, string s, char delim, char quote_char)
public static void ApplyQuotesIfNeeded(StringBuilder sb, string s, char delim, char quote)
{
if (s.IndexOfAny(new char[] {delim, '\r', '\n', quote_char}) >= 0)
if (s.IndexOfAny(new char[] {delim, '\r', '\n', quote}) >= 0)
{
sb.Append(quote_char);
sb.Append(quote);
for (int ii = 0; ii < s.Length; ii++)
{
char c = s[ii];
sb.Append(c);
if (c == quote_char)
sb.Append(quote_char);
if (c == quote)
sb.Append(quote);
}
sb.Append(quote_char);
sb.Append(quote);
}
else sb.Append(s);
}

public string TableToCsv(JArray table, char delim = ',', char quote_char = '"', string[] header = null, bool bools_as_ints = false, string newline = "\n")
/// <summary>
/// Append the CSV representation of jnode to sb.<br></br>
/// - strings are appended as-is unless they contain the delimiter, the quote character, '\r', or '\n' (see ApplyQuotesIfNeeded)<br></br>
/// - dates are formatted as yyyy-MM-dd, and datetimes as yyyy-MM-dd HH:mm:ss (24-hour clock)<br></br>
/// - null appends nothing (an empty entry)<br></br>
/// - bools are appended as true/false, or as 1/0 if boolsAsInts is true<br></br>
/// - everything else uses jnode.ToString(), and is quoted if needed just like a string would be
/// </summary>
/// <param name="sb">the StringBuilder that the CSV text is appended to</param>
/// <param name="jnode">the value to represent as one CSV entry</param>
/// <param name="delim">CSV delimiter</param>
/// <param name="quote">CSV quote character</param>
/// <param name="boolsAsInts">if true, represent true as 1 and false as 0</param>
public static void CsvStringToSb(StringBuilder sb, JNode jnode, char delim, char quote, bool boolsAsInts)
{
    string val;
    switch (jnode.type)
    {
        case Dtype.STR:
            val = (string)jnode.value;
            break; // only apply quotes if internal delims, quotes, or newlines
        case Dtype.DATE:
            // invariant culture so that the output does not depend on the user's calendar
            val = ((DateTime)jnode.value).ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
            break;
        case Dtype.DATETIME:
            // "HH" is the 24-hour clock; "hh" would be a 12-hour clock, and with no AM/PM designator
            // in the format, 15:30:00 would be indistinguishable from 03:30:00.
            // invariant culture so that ':' is not replaced by a culture-specific time separator.
            val = ((DateTime)jnode.value).ToString("yyyy-MM-dd HH:mm:ss", System.Globalization.CultureInfo.InvariantCulture);
            break;
        case Dtype.NULL:
            return; // nulls should be empty entries
        case Dtype.BOOL:
            sb.Append((bool)jnode.value
                ? (boolsAsInts ? "1" : "true")
                : (boolsAsInts ? "0" : "false"));
            return;
        default:
            val = jnode.ToString();
            break;
    }
    // every non-null, non-bool value gets the same quoting treatment,
    // so a delimiter inside a non-string representation is still escaped
    ApplyQuotesIfNeeded(sb, val, delim, quote);
}

public string TableToCsv(JArray table, char delim = ',', char quote = '"', string newline = "\n", string[] header = null, bool boolsAsInts = false)
{
// allow the user to supply their own column order. If they don't, just alphabetically sort colnames
if (header == null)
{
HashSet<string> all_keys = new HashSet<string>();
HashSet<string> allKeys = new HashSet<string>();
foreach (JNode child in table.children)
{
foreach (string key in ((JObject)child).children.Keys)
{
all_keys.Add(key);
allKeys.Add(key);
}
}
header = new string[all_keys.Count];
header = new string[allKeys.Count];
int ii = 0;
foreach (string key in all_keys)
foreach (string key in allKeys)
{
header[ii++] = key;
}
Expand All @@ -846,7 +874,7 @@ public string TableToCsv(JArray table, char delim = ',', char quote_char = '"',
for (int ii = 0; ii < header.Length; ii++)
{
string col = header[ii];
ApplyQuotesIfNeeded(sb, col, delim, quote_char);
ApplyQuotesIfNeeded(sb, col, delim, quote);
if (ii < header.Length - 1) sb.Append(delim);
}
sb.Append(newline);
Expand All @@ -856,40 +884,14 @@ public string TableToCsv(JArray table, char delim = ',', char quote_char = '"',
for (int ii = 0; ii < header.Length; ii++)
{
string col = header[ii];
if (!orow.children.TryGetValue(col, out JNode val))
if (!orow.children.TryGetValue(col, out JNode jnode))
{
// sometimes the row might have a missing key.
// in that case, just leave an empty entry, and add a delimiter if needed.
if (ii < header.Length - 1) sb.Append(delim);
continue;
}
switch (val.type)
{
case Dtype.STR:
ApplyQuotesIfNeeded(sb, (string)val.value, delim, quote_char);
break; // only apply quotes if internal delim
case Dtype.DATE:
sb.Append(((DateTime)val.value).ToString("yyyy-MM-dd"));
break;
case Dtype.DATETIME:
sb.Append(((DateTime)val.value).ToString("yyyy-MM-dd hh:mm:ss"));
break;
case Dtype.NULL:
break; // nulls should be empty entries
case Dtype.BOOL:
if (bools_as_ints)
{
sb.Append((bool)val.value ? "1" : "0");
}
else
{
sb.Append((bool)val.value ? "true" : "false");
}
break;
default:
sb.Append(val.ToString()); // everything else gets quote chars
break;
}
CsvStringToSb(sb, jnode, delim, quote, boolsAsInts);
if (ii < header.Length - 1) sb.Append(delim);
}
sb.Append(newline);
Expand Down
9 changes: 5 additions & 4 deletions JsonToolsNppPlugin/JSONTools/RemesPathFunctions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2865,7 +2865,7 @@ public static JNode ToCsv(List<JNode> args)
int ii = 0;
foreach (JNode ochild in o.children.Values)
{
JsonTabularizer.ApplyQuotesIfNeeded(sb, ochild.ValueOrToString(), delim, quote);
JsonTabularizer.CsvStringToSb(sb, ochild, delim, quote, false);
ii++;
if (ii < o.Length)
sb.Append(delim);
Expand All @@ -2881,15 +2881,16 @@ public static JNode ToCsv(List<JNode> args)
{
if (!(x is JArray a))
{
JsonTabularizer.ApplyQuotesIfNeeded(sb, x.ValueOrToString(), delim, quote);
JsonTabularizer.CsvStringToSb(sb, x, delim, quote, false);
sb.Append(newline);
nColumns = 1;
return true;
}
nColumns = a.children.Count;
var children = a.children;
nColumns = children.Count;
for (int ii = 0; ii < nColumns; ii++)
{
JsonTabularizer.ApplyQuotesIfNeeded(sb, a.children[ii].ValueOrToString(), delim, quote);
JsonTabularizer.CsvStringToSb(sb, children[ii], delim, quote, false);
if (ii < a.Length - 1)
sb.Append(delim);
}
Expand Down
4 changes: 2 additions & 2 deletions JsonToolsNppPlugin/Properties/AssemblyInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@
// Build Number
// Revision
//
[assembly: AssemblyVersion("5.8.0.14")]
[assembly: AssemblyFileVersion("5.8.0.14")]
[assembly: AssemblyVersion("5.8.0.15")]
[assembly: AssemblyFileVersion("5.8.0.15")]
20 changes: 10 additions & 10 deletions JsonToolsNppPlugin/Tests/JsonTabularizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,7 @@ public static bool Test()
// TEST CSV CREATION
JsonParser fancyParser = new JsonParser(LoggerLevel.JSON5);

var csv_testcases = new (string inp, string desired_out, char delim, char quote_char, string[] header, bool bools_as_ints, string eol)[]
var csvTestcases = new (string inp, string desired_out, char delim, char quote_char, string[] header, bool bools_as_ints, string eol)[]
{
( "[{\"a\": 1, \"b\": \"a\"}, {\"a\": 2, \"b\": \"b\"}]", "a,b\r\n1,a\r\n2,b\r\n", ',', '"', null, false, "\r\n" ),
( "[{\"a\": 1, \"b\": \"a\"}, {\"a\": 2, \"b\": \"b\"}]", "a,b\r1,a\r2,b\r", ',', '"', null, false, "\r" ),
Expand Down Expand Up @@ -576,40 +576,40 @@ public static bool Test()
',', '"', null, false, "\r\n"
),
};
foreach ((string inp, string desired_out, char delim, char quote_char, string[] header, bool bools_as_ints, string eol) in csv_testcases)
foreach ((string inp, string desired_out, char delim, char quote, string[] header, bool boolsAsInts, string eol) in csvTestcases)
{
ii++;
JNode table = fancyParser.Parse(inp);
string result = "";
string head_str = header == null ? "null" : '[' + string.Join(", ", header) + ']';
string escapedEol = JNode.StrToString(eol, true);
string message_without_desired = $"With default strategy, expected TableToCsv({inp}, '{delim}', '{quote_char}', {head_str}, {escapedEol})\nto return\n";
string base_message = $"{message_without_desired}{desired_out}\n";
int msg_len = Encoding.UTF8.GetByteCount(desired_out) + 1 + message_without_desired.Length;
string messageWithoutDesired = $"With default strategy, expected TableToCsv({inp}, '{delim}', '{quote}', {head_str}, {escapedEol})\nto return\n";
string baseMessage = $"{messageWithoutDesired}{desired_out}\n";
int msgLen = Encoding.UTF8.GetByteCount(desired_out) + 1 + messageWithoutDesired.Length;
try
{
result = tabularizer.TableToCsv((JArray)table, delim, quote_char, header, bools_as_ints, eol);
result = tabularizer.TableToCsv((JArray)table, delim, quote, eol, header, boolsAsInts);
int result_len = Encoding.UTF8.GetByteCount(result);
try
{
if (!desired_out.Equals(result))
{
tests_failed++;
Npp.editor.AppendText(msg_len + 17 + result_len + 1, $"{base_message}Instead returned\n{result}\n");
Npp.editor.AppendText(msgLen + 17 + result_len + 1, $"{baseMessage}Instead returned\n{result}\n");
}
}
catch (Exception ex)
{
tests_failed++;
int ex_len = ex.ToString().Length;
Npp.editor.AppendText(msg_len + 17 + result_len + 21 + ex_len + 1,
$"{base_message}Instead returned\n{result}\nand threw exception\n{ex}\n");
Npp.editor.AppendText(msgLen + 17 + result_len + 21 + ex_len + 1,
$"{baseMessage}Instead returned\n{result}\nand threw exception\n{ex}\n");
}
}
catch (Exception ex)
{
tests_failed++;
Npp.AddLine($"{base_message}Instead threw exception\n{ex}");
Npp.AddLine($"{baseMessage}Instead threw exception\n{ex}");
}
}
// TEST NO_RECURSION setting
Expand Down
6 changes: 3 additions & 3 deletions JsonToolsNppPlugin/Tests/RemesPathTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ public static bool Test()
new Query_DesiredResult("str(csv_regex(5, , , `'`))", JNode.StrToString(JNode.StrToString(ArgFunction.CsvRowRegex(5, quote:'\''), true), true)),
new Query_DesiredResult("to_csv(@.foo)", "\"0,1,2\\r\\n3.0,4.0,5.0\\r\\n6.0,7.0,8.0\\r\\n\""),
new Query_DesiredResult("to_csv(j`[{\"a\\\\tb\": 1, \"c\": null}, {\"a\\\\tb\": -7.5, \"c\": \"bar\\\\n'baz'\"}]`, `\\t`, `\\r`, `'`)",
"\"'a\\tb'\\tc\\r1\\tnull\\r-7.5\\t'bar\\n''baz'''\\r\""),
"\"'a\\tb'\\tc\\r1\\t\\r-7.5\\t'bar\\n''baz'''\\r\""), // null values are converted to empty strings
new Query_DesiredResult("to_csv(@.foo[:2], `.`, `\\n`, `#`)", "\"0.1.2\\n#3.0#.#4.0#.#5.0#\\n\""),
new Query_DesiredResult("to_csv(@.foo[1], `.`)", "\"\\\"3.0\\\"\\r\\n\\\"4.0\\\"\\r\\n\\\"5.0\\\"\""),
new Query_DesiredResult("to_csv(@.foo[:][0],,`\\n`)", "\"0\\n3.0\\n6.0\""),
Expand Down Expand Up @@ -581,9 +581,9 @@ public static bool Test()
"1\\r" +
".3\\r" +
"5\\r" +
"-7.2`" +
"$-7.2$`" +
", 1, `^`, `\\r`, `$`, d, 0)",
"[{\"foo\": 1}, {\"foo\": 0.3}, {\"foo\": 5}, {\"foo\": -7.2}]"),
"[{\"foo\": 1}, {\"foo\": 0.3}, {\"foo\": 5}, {\"foo\": \"-7.2\"}]"),
// ====================== s_fa function for parsing regex search results as string arrays or arrays of arrays of strings =========
// 2 capture groups (2nd optional), parse the first group as number
new Query_DesiredResult("s_fa(`1. foo boo\\r\\n" +
Expand Down
Loading

0 comments on commit 9328eb5

Please sign in to comment.