From 210e0f067f2091ead7d6aa881053de7769ee9339 Mon Sep 17 00:00:00 2001
From: Michael Kay <mike@saxonica.com>
Date: Fri, 6 Dec 2024 21:46:42 +0000
Subject: [PATCH] Normalize line endings in CSV prior to parsing

---
 .../src/function-catalog.xml                  | 54 +++++++++----------
 .../src/xpath-functions.xml                   | 28 ++++++----
 2 files changed, 42 insertions(+), 40 deletions(-)
diff --git a/specifications/xpath-functions-40/src/function-catalog.xml b/specifications/xpath-functions-40/src/function-catalog.xml
index 34d704160..f6f1fd196 100644
--- a/specifications/xpath-functions-40/src/function-catalog.xml
+++ b/specifications/xpath-functions-40/src/function-catalog.xml
@@ -25867,7 +25867,8 @@ return json-to-xml($json, $options)]]></eg>
                <fos:meaning>The character used to delimit rows within
                   the CSV string. An instance of
                   <code>xs:string</code> whose length is exactly one.
-                  Defaults to a single newline character (<char>U+000A</char>).</fos:meaning>
+                  Defaults to a single newline character (<char>U+000A</char>).
+                  Note that the row delimiter is matched against the input after line endings have been normalized.</fos:meaning>
                <fos:type>xs:string</fos:type>
                <fos:default>char('\n')</fos:default>
             </fos:option>
@@ -25891,7 +25892,7 @@ return json-to-xml($json, $options)]]></eg>
                   </fos:value>
                </fos:values>
             </fos:option>
-            <fos:option key="normalize-newlines">
+            <!--<fos:option key="normalize-newlines">
                <fos:meaning>Determines whether CR and CRLF character sequences
                   are treated as equivalent to NL characters.</fos:meaning>
                <fos:type>xs:boolean</fos:type>
@@ -25906,7 +25907,7 @@ return json-to-xml($json, $options)]]></eg>
                   whether or not NL is used as the row delimiter.
                   </fos:value>
                </fos:values>
-            </fos:option>
+            </fos:option>-->
             <fos:option key="header">
                <fos:meaning>Determines whether the first row of the CSV should be treated as a list
                   of column names, or whether column names are being supplied by the caller. 
@@ -25978,11 +25979,10 @@ return json-to-xml($json, $options)]]></eg>
 
       </fos:errors>
       <fos:notes>
-         <p>The default row delimiter is a single newline character <char>U+000A</char>. If the content
-         is read using the <code>unparsed-text</code> function, alternative line endings
-         such as <code>CR</code> and <code>CRLF</code> will have been normalized to a single
-         newline. In other cases, this normalization can be achieved by setting the 
-         <code>normalize-newlines</code> option.</p>
+         <p>The default row delimiter is a single newline character <char>U+000A</char>. 
+            Alternative line endings
+         such as <code>CR</code> and <code>CRLF</code> are normalized to a single
+         newline before the input is parsed.</p>
          <p>All fields are returned as <code>xs:string</code> values.</p>
          <p>Quoted fields in the input are returned without the quotes.</p>
          <p>For more discussion of the returned data, see <specref ref="csv-to-xdm-mapping"/>.</p>
@@ -26234,7 +26234,9 @@ return (
       <fos:rules>
          
          <p>The <code>$value</code> argument is CSV data, as defined in <bibref ref="rfc4180"/>, in the form of an
-            <code>xs:string</code> value. The function parses this string.
+            <code>xs:string</code> value. The function parses this string,
+            after normalizing newlines so that each <char>U+000D</char> character, or (<char>U+000D</char>, <char>U+000A</char>)
+            pair, is converted to a single <char>U+000A</char>.
             The result of the function is a sequence of arrays of strings, that is
             <code>array(xs:string)*</code>; each array represents one row of the CSV input.</p>
          
@@ -26289,7 +26291,7 @@ return (
                   </fos:value>
                </fos:values>
             </fos:option>
-            <fos:option key="normalize-newlines">
+            <!--<fos:option key="normalize-newlines">
                <fos:meaning>Determines whether CR and CRLF character sequences
                   are treated as equivalent to NL characters.</fos:meaning>
                <fos:type>xs:boolean</fos:type>
@@ -26304,7 +26306,7 @@ return (
                      whether or not NL is used as the row delimiter.
                   </fos:value>
                </fos:values>
-            </fos:option>
+            </fos:option>-->
          </fos:options>
 
          <p>An empty field is represented by a zero-length string. An empty field is deemed to exist
@@ -26322,8 +26324,8 @@ return (
          contain no rows; while if <code>$value</code> consists of a single row delimiter,
          it is considered to contain a single blank row. The presence or
          absence of a final row delimiter generally has no effect on the result,
-         except in the situation described in the previous paragraph where it causes a
-         blank row to exist.</p>
+         except when the input consists solely of that row delimiter, in which case it causes a
+         single blank row to exist.</p>
          
          
       </fos:rules>
@@ -26339,12 +26341,10 @@ return (
             <code>quote-character</code>.</p>
       </fos:errors>
       <fos:notes>
-         <p>The default row delimiter is a single newline character <char>U+000A</char>. If the content
-            is read using the <code>unparsed-text</code> function, alternative line endings
-            such as <code>CR</code> and <code>CRLF</code> will have been normalized to a single
-            newline. In other cases, this normalization can be achieved by setting the
-            option <code>normalize-newlines</code>. This option does not affect CR or CRLF
-            sequences occurring within quoted fields.</p>
+         <p>The default row delimiter is a single newline character <char>U+000A</char>. 
+            Alternative line endings
+            such as <code>CR</code> and <code>CRLF</code> are normalized to a single
+            newline before the input is parsed.</p>
          <p>All fields are returned as <code>xs:string</code> values.</p>
          <p>Quoted fields in the input are returned without the quotes.</p>
          <p>The first row is not treated specially.</p>
@@ -26405,8 +26405,7 @@ return (
 return csv-to-arrays(
   `name,city{ $CRLF }` ||
   `Bob,Berlin{ $CRLF }` ||
-  `Alice,Aachen{ $CRLF }`, 
-  { "normalize-newlines": true() }
+  `Alice,Aachen{ $CRLF }`
 )</eg></fos:expression>
                <fos:result><eg>[ "name", "city" ],
 [ "Bob", "Berlin" ],
@@ -26618,7 +26617,7 @@ return document {
             <p>With defaults for delimiters and quotes, recognizing headers:</p>
             <fos:test use="escaped-crlf-3 csv-string-2">
                <fos:expression><eg>csv-to-xml($csv-string, 
-         { "header": true(), "normalize-newlines": true() })</eg></fos:expression>
+         { "header": true() })</eg></fos:expression>
                <fos:result normalize-space="true"><eg><![CDATA[
 <csv xmlns="http://www.w3.org/2005/xpath-functions">
   <columns>
@@ -26651,8 +26650,7 @@ return document {
                <fos:expression><eg>csv-to-xml(
   $csv-uneven-cols, 
   { "header": true(), 
-    "select-columns": (2, 1, 4), 
-    "normalize-newlines": true() 
+    "select-columns": (2, 1, 4)
   }
 )</eg></fos:expression>
                <fos:result normalize-space="true"><eg><![CDATA[
@@ -26688,7 +26686,7 @@ return document {
             <fos:test use="escaped-crlf-3 uneven-cols-csv-string-2">
                <fos:expression><eg>csv-to-xml(
   $csv-uneven-cols, 
-  { "header": true(), "normalize-newlines": true() }
+  { "header": true() }
 )</eg></fos:expression>
                <fos:result normalize-space="true"><eg><![CDATA[
 <csv xmlns="http://www.w3.org/2005/xpath-functions">
@@ -26737,8 +26735,7 @@ return document {
                <fos:expression><eg>csv-to-xml(
   $csv-uneven-cols, 
   { "header": true(), 
-    "trim-rows": true(), 
-    "normalize-newlines": true() 
+    "trim-rows": true()
   }
 )</eg></fos:expression>
                <fos:result normalize-space="true"><eg><![CDATA[
@@ -26791,8 +26788,7 @@ return document {
                <fos:expression><eg>csv-to-xml(
   $csv-uneven-cols, 
   { "header": true(), 
-    "select-columns": 1 to 6, 
-    "normalize-newlines": true() 
+    "select-columns": 1 to 6
   }
 )</eg></fos:expression>
                <fos:result normalize-space="true"><eg><![CDATA[
diff --git a/specifications/xpath-functions-40/src/xpath-functions.xml b/specifications/xpath-functions-40/src/xpath-functions.xml
index 4c81f6c00..c9ffce0a0 100644
--- a/specifications/xpath-functions-40/src/xpath-functions.xml
+++ b/specifications/xpath-functions-40/src/xpath-functions.xml
@@ -6927,7 +6927,12 @@ correctly in all browsers, depending on the system configuration.</emph></p>-->
             <ulist>
                <item><p>This specification uses the term <term>row</term> where RFC 4180 uses
                <term>record</term>.</p></item>
-               <item><p>Row delimiters other than <code>CRLF</code> are recognized.</p></item>
+               <item><p>Line endings are normalized: specifically, the character sequences
+               <char>U+000D</char>, or <char>U+000D</char> followed by <char>U+000A</char>, are converted
+                  to a single <char>U+000A</char> character. This applies whether or not the line ending
+               appears within a quoted string, and whether or not <char>U+000A</char> is the chosen
+               row delimiter.</p></item>
+               <item><p>Row delimiters other than newline are recognized.</p></item>
                <item><p>Field delimiters other than comma (<code>","</code>) are recognized.</p></item>
                <item><p>Quote characters other than the double quotation mark (<code>'"'</code>)
                are recognized.</p></item>
@@ -6963,18 +6968,18 @@ correctly in all browsers, depending on the system configuration.</emph></p>-->
 
                   <p>Rows in CSV files are typically delimited with CRLF (<char>U+000D</char>, <char>U+000A</char>), 
                      LF (<char>U+000A</char>), or CR (<char>U+000D</char>) line endings, 
-                     although RFC 4180 specifies CRLF. By contrast, the <code>fn:unparsed-text</code>
-                     function normalizes these line endings to LF (<char>U+000A</char>).
-                     The CSV parsing functions therefore use LF by default. An option is available
-                     to normalize line endings so that CR and CRLF are converted to <char>U+000A</char> (except
-                     when they appear in quote fields). This option is off by default, because
-                     line ending normalization will usually have been carried out earlier: for 
-                     example, the <code>fn:unparsed-text</code> function does it automatically.
+                     although RFC 4180 specifies CRLF. The CSV parsing functions 
+                     normalize these line endings to LF (<char>U+000A</char>).
+                     They therefore use LF as the default row delimiter.
                   </p>
                
                  
                
-                  <p>The last row in the file may or may not be followed by a row delimiter.</p>
+                  <p>The last row in the file may or may not be followed by a row delimiter.
+                  An empty file is treated as containing zero rows, while a file consisting solely
+                  of a row delimiter is treated as containing one empty row. In all other cases,
+                  a file that does not end with a row delimiter is treated as if a row delimiter were
+                  added at the end.</p>
 
                   <p>Fields in CSV are frequently delimited with a comma. Other field
                      delimiters are useful, for
@@ -6982,7 +6987,7 @@ correctly in all browsers, depending on the system configuration.</emph></p>-->
                      chosen field delimiter is then often <char>U+003B</char>
                      or <char>U+0009</char>.</p>
 
-               <p>The column delimiter defaults to <char>U+002C</char>. 
+               <p>The column delimiter thus defaults to <char>U+002C</char>. 
                      The value may be
                      any single Unicode character. An error is raised if the 
                      <code>column-delimiter</code> option is set to a multi-character string.</p>
@@ -6991,7 +6996,8 @@ correctly in all browsers, depending on the system configuration.</emph></p>-->
             <div3 id="csv-field-quoting">
                <head>Field quoting</head>
 
-               <p>CSVs, as specified in <bibref ref="rfc4180"/>, require that fields be wrapped with a quote character if they
+               <p>CSVs, as specified in <bibref ref="rfc4180"/>, require that fields be wrapped 
+                  with a quote character if they
                   contain either the row or column delimiter. For example:</p>
 
                <eg>"A single field, containing a comma","another field containing CRLF