From 6f759557d7f6160680d2d0f9ea6f41e4b9959f28 Mon Sep 17 00:00:00 2001
From: liquidaty <info@liquidaty.com>
Date: Tue, 22 Nov 2022 10:05:18 -0800
Subject: [PATCH] cleanup and wasm movemask (#81)

* add SSE2 movemask
* add wasm_movemask and update js benchmarks
* update npm message for EAGAIN readsync error using sync read + stdin on wasm
---
 README.md                                     |  8 +-
 app/Makefile                                  |  2 +-
 configure                                     | 55 +++++++++--
 examples/js/Makefile                          | 93 +++++++++++++------
 examples/js/README.md                         | 22 +++--
 examples/js/js/foot.js                        | 35 +++++--
 examples/js/npm/test/count-papaparse.js       | 58 ++++++++++++
 examples/js/npm/test/select_all-csv-parser.js | 11 ++-
 examples/js/npm/test/select_all-papaparse.js  | 67 +++++++++++++
 examples/js/npm/test/select_all.js            |  8 +-
 examples/lib/README.md                        |  2 -
 src/Makefile                                  |  2 +-
 src/zsv_internal.c                            | 23 ++++-
 13 files changed, 312 insertions(+), 74 deletions(-)
 create mode 100644 examples/js/npm/test/count-papaparse.js
 create mode 100644 examples/js/npm/test/select_all-papaparse.js

diff --git a/README.md b/README.md
index 810316c1..a11665c7 100644
--- a/README.md
+++ b/README.md
@@ -75,7 +75,7 @@ that implements the expected
   [app/benchmark/README.md](app/benchmark/README.md)
 * Low memory usage (regardless of how big your data is) and size footprint for
   both lib (~20k) and CLI executable (< 1MB)
-* Easy to use as a library in a few lines of code
+* Easy to use as a library in a few lines of code, via either pull or push parsing
 * Includes the `zsv` CLI with the following built-in commands:
   * `select`, `count`, `sql` query, `desc`ribe, `flatten`, `serialize`, `2json`,
     `2db`, `stack`, `pretty`, `2tsv`, `jq`, `prop`, `rm`
@@ -157,10 +157,6 @@ choco.exe install zsv -source .\zsv-amd64-windows-mingw.nupkg
 choco.exe uninstall zsv
 ```
 
-**NOTE**: Windows build has a runtime dependency on `libwinpthread-1.dll`.
-Please download it from here (https://wikidll.com/mingw-w64/libwinpthread-1-dll)
-according to your Windows version and place it with `zsv` executable.
-
 #### Node
 
 The zsv parser library is available for node:
@@ -256,7 +252,7 @@ zsv sql my_population_data.csv "select * from data where population > 100000"
 
 ### Using the API
 
-Basic examples of using the API are in [examples/lib/README.md](examples/lib/README.md).
+Full application code examples can be found at [examples/lib/README.md](examples/lib/README.md).
 
 An example of using the API, compiled to wasm and called via Javascript,
 is in [examples/js/README.md](examples/js/README.md).
diff --git a/app/Makefile b/app/Makefile
index de2259f7..e540a31d 100755
--- a/app/Makefile
+++ b/app/Makefile
@@ -128,7 +128,7 @@ ifneq ($(findstring emcc,$(CC)),) # emcc
     LDFLAGS+=-pthread
   endif
 else # not emcc
-  CFLAGS+= ${CFLAGS_AVX}
+  CFLAGS+= ${CFLAGS_AVX} ${CFLAGS_SSE}
   LDFLAGS+=-lpthread # Linux explicitly requires
 endif
 UTILS=$(addprefix ${BUILD_DIR}/objs/utils/,$(addsuffix .o,${UTILS1}))
diff --git a/configure b/configure
index 045f366d..c0581e11 100755
--- a/configure
+++ b/configure
@@ -16,6 +16,7 @@ Configuration:
 Optional configuration:
   --minimal=yes           do not include extra features (default=no)
   --arch=ARCH             use -march=ARCH. Set to 'none' for none, else defaults to 'native'
+  --jq-prefix=JQ_PREFIX   specify directory containing lib/libjq and include/jq.h
 
 Installation directories:
   --prefix=PREFIX         main installation prefix [\$PREFIX or /usr/local]
@@ -30,6 +31,7 @@ Optional features:
   --try-avx512            use avx512 instructions, if available [no]
   --force-avx2            force compile with (no CPU check) or without -mavx2 [auto]
   --force-avx             force compile with (no CPU check) or without -mavx [auto]
+  --force-sse2            force compile with (no CPU check) or without -msse2 [auto]
   --enable-lto            compile with LTO (works with some but not all platforms/compilers) [no]
   --enable-whole-program  compile without -fwhole-program even if no -flto [yes]
   --enable-pie            build with position independent executables [auto]
@@ -45,6 +47,8 @@ Some influential environment variables:
   CFLAGS                  C compiler flags [-Os -pipe ...]
   LDFLAGS                 Linker flags
 
+  CROSS_COMPILING=no      Set to yes to disable auto-detect compilation flags
+
 Use these variables to override the choices made by configure.
 
 EOF
@@ -244,7 +248,7 @@ trysharedldflag () {
 }
 
 # Beginning of actual script
-
+CROSS_COMPILING=${CROSS_COMPILING:-no} # honor the environment variable advertised in --help; default to a native build
 CFLAGS_AUTO=
 CFLAGS_TRY=
 LDFLAGS_AUTO=
@@ -254,7 +258,7 @@ if [ "$CONFIGFILE" = "" ]; then
     CONFIGFILE=config.mk
 fi
 
-if [ "$ARCH" = "" ]; then
+if [ "$ARCH" = "" ] && [ "$CROSS_COMPILING" = "no" ]; then
     ARCH=native
 fi
 
@@ -290,6 +294,9 @@ MINIMAL=no
 
 TRY_LTO=no
 TRY_WHOLE_PROGRAM=auto
+FORCE_AVX2=auto
+FORCE_AVX=auto
+FORCE_SSE2=auto
 
 help=yes
 usepie=auto
@@ -323,6 +330,9 @@ for arg ; do
         --force-avx|--force-avx=yes) FORCE_AVX=yes;;
         --force-avx=no) FORCE_AVX=no;;
 
+        --force-sse2|--force-sse2=yes) FORCE_SSE2=yes;;
+        --force-sse2=no) FORCE_SSE2=no;;
+
         --enable-lto|--enable-lto=yes) TRY_LTO=yes;;
         --enable-lto|--enable-lto=auto) TRY_LTO=auto;;
         --disable-lto|--enable-lto=no) TRY_LTO=no;;
@@ -517,18 +527,40 @@ tryflag CFLAGS -ffunction-sections
 tryflag CFLAGS -fdata-sections
 
 CFLAGS_AVX=
-if [ "$FORCE_AVX2" = "yes" ] ; then
+
+HAVE_AVX=
+if [ "$FORCE_AVX2" = "no" ]; then
+    tryflag CFLAGS -mno-avx2
+elif [ "$FORCE_AVX2" = "yes" ] ; then
     CFLAGS_AVX=-mavx2
-    trycpusupport avx2 || echo "warning: avx2 forced but not supported on native CPU"
-elif [ "$FORCE_AVX2" != "no" ] ; then
+    if [ "$CROSS_COMPILING" = "no" ] ; then
+        trycpusupport avx2 || echo "warning: avx2 forced but not supported on native CPU"
+    fi
+elif [ "$FORCE_AVX2" = "auto" ] && [ "$CROSS_COMPILING" = "no" ] ; then
     trycpusupport avx2 && CFLAGS_AVX=-mavx2
 fi
 
 if [ "$FORCE_AVX" = "yes" ] ; then
     CFLAGS_AVX=-mavx || echo "warning: avx forced but not supported on native CPU"
-elif [ "$FORCE_AVX" != "no" ] && [ "$CFLAGS_AVX" = "" ] ; then
+elif [ "$FORCE_AVX" = "auto" ] && [ "$CFLAGS_AVX" = "" ] && [ "$CROSS_COMPILING" = "no" ] ; then
     trycpusupport avx && CFLAGS_AVX=-mavx
 fi
+if [ "$FORCE_AVX" = "no" ]; then
+    tryflag CFLAGS -mno-avx
+fi
+
+if [ "$FORCE_SSE2" = "no" ]; then
+    tryflag CFLAGS -mno-sse2
+elif [ "$FORCE_SSE2" = "yes" ] ; then
+    CFLAGS_SSE=-msse2
+    if [ "$CROSS_COMPILING" = "no" ] ; then
+        trycpusupport sse2 || echo "warning: sse2 forced but not supported on native CPU"
+    fi
+elif [ "$FORCE_SSE2" = "auto" ] && [ "$CROSS_COMPILING" = "no" ] ; then
+    if [ "$CFLAGS_SSE" = "" ] && [ "$CROSS_COMPILING" = "no" ] ; then
+        trycpusupport sse2 && tryflag CFLAGS_SSE -msse2
+    fi
+fi
 
 HAVE_LTO=0
 if [ "$TRY_LTO" = "yes" ]; then
@@ -557,7 +589,10 @@ tryflag CFLAGS_OPT -fvisibility=hidden
 tryldflag LDFLAGS_AUTO -Wl,--gc-sections
 
 if [ "$ARCH" != "none" ] ; then
-    tryldflag LDFLAGS_OPT -march=$ARCH
+    if [ "$ARCH" != "" ] && ! tryflag CFLAGS -march=$ARCH ; then # skip when ARCH is unset (cross-compiling) instead of trying a bare -march=
+        echo "Flag -march=$ARCH failed!"
+        exit 1
+    fi
 fi
 tryldflag LDFLAGS_OPT -ldl
 
@@ -602,7 +637,7 @@ if [ "$usetermcap" = "yes" ] || [ "$usetermcap" = "auto" ] ; then
             fi
 fi
 
-if [ "$JQ_PREFIX" != "" ] && [ "$ARCH" = "native" ]; then
+if [ "$JQ_PREFIX" != "" ] && [ "$CROSS_COMPILING" = "no" ] ; then
     echo "checking --prefix-jq ${JQ_PREFIX}"
     if ! tryldflag LDFLAGS_JQ -ljq -L${JQ_PREFIX}/lib ; then
         echo "Error: Failed to compile with -ljq and -L${JQ_PREFIX}/lib"
@@ -676,9 +711,11 @@ CFLAGS_LTO = $CFLAGS_LTO
 LDFLAGS_AUTO = $LDFLAGS_AUTO
 
 HAVE_AVX512=$HAVE_AVX512
-CFLAGS_AVX_512=$CFLAGS_AVX_512
 
+CFLAGS_AVX_512=$CFLAGS_AVX_512
 CFLAGS_AVX=$CFLAGS_AVX
+CFLAGS_SSE=$CFLAGS_SSE
+
 CFLAGS_DEBUG = -U_FORTIFY_SOURCE -UNDEBUG -O0 -g -Wall -Wextra -Wno-missing-field-initializers -Wno-unused-parameter # -g3 -ggdb
 LDFLAGS_DEBUG = -U_FORTIFY_SOURCE -UNDEBUG -O0 -g # -g3 -ggdb
 CFLAGS_PIC = $CFLAGS_PIC
diff --git a/examples/js/Makefile b/examples/js/Makefile
index c3f1953d..d7bc8ea9 100644
--- a/examples/js/Makefile
+++ b/examples/js/Makefile
@@ -41,7 +41,7 @@ INDEX=${BUILD_DIR}/index.html
 EMJS=${BUILD_DIR}/zsv.em.js
 WASM=${BUILD_DIR}/zsv.em.wasm
 
-CFLAGS+= ${CFLAGS_PIC} -s ALLOW_MEMORY_GROWTH=1 -s EXPORTED_RUNTIME_METHODS="['setValue','addFunction','removeFunction','writeArrayToMemory']" -s RESERVED_FUNCTION_POINTERS=4 -s EXPORTED_FUNCTIONS="['_free','_malloc']"
+CFLAGS+= ${CFLAGS_PIC} -s ALLOW_MEMORY_GROWTH=1 -s EXPORTED_RUNTIME_METHODS="['setValue','addFunction','removeFunction','writeArrayToMemory']" -s RESERVED_FUNCTION_POINTERS=4 -s EXPORTED_FUNCTIONS="['_free','_malloc']" -sASSERTIONS
 
 ifeq ($(DEBUG),1)
   CFLAGS += ${CFLAGS_DEBUG}
@@ -64,14 +64,14 @@ TEST_PASS=echo "${COLOR_BLUE}$@: ${COLOR_GREEN}Passed${COLOR_NONE}"
 TEST_FAIL=(echo "${COLOR_BLUE}$@: ${COLOR_RED}Failed!${COLOR_NONE}" && exit 1)
 #####
 
-.PHONY: help all run clean prep node setup benchmark count_compare
+.PHONY: help all run clean prep node setup benchmark count_compare select_compare
 
 help:
 	@echo "make [build|run|node|test|clean]"
 	@echo "by default, minified code is generated, which requires running the below once:"
 	@echo "  make setup"
 	@echo "alternatively, to generate non-minified code, use NO_MINIFY=1:"
-	@echo "  make NO_MINIFY=1 [build|run|node|test]"
+	@echo "  make NO_MINIFY=1 [build|run|node|test|benchmark]"
 
 build: ${BROWSER_JS} ${STATIC}
 	@echo Built ${BROWSER_JS}
@@ -91,7 +91,7 @@ test: npm/test/select_all.js node
 	@mkdir -p build/test
 	@cp -p $< node/
 	@echo "Running test (example) program \`node node/select_all.js ../../data/test/desc.csv\`"
-	@(cd node && node select_all.js ../../../data/test/desc.csv > ../build/test/out.json 2> ../build/test/out.err1)
+	@(cd node && ${NODE} select_all.js ../../../data/test/desc.csv > ../build/test/out.json 2> ../build/test/out.err1)
 	@sed 's/[0-9.]*ms//g' < build/test/out.err1 > build/test/out.err
 	@cmp build/test/out.err npm/test/out.err
 	@cmp build/test/out.json npm/test/out.json && ${TEST_PASS} || ${TEST_FAIL}
@@ -121,6 +121,7 @@ ifeq ($(NO_MINIFY),1)
 	@mv $@.tmp.js $@
 else
 	@uglifyjs $@.tmp.js -c -m > $@
+	rm $@.tmp.js
 endif
 
 ### node package build
@@ -142,6 +143,7 @@ ifeq ($(NO_MINIFY),1)
 	@mv $@.tmp.js $@
 else
 	@uglifyjs $@.tmp.js -c -m > $@
+	rm $@.tmp.js
 endif
 
 setup:
@@ -152,50 +154,83 @@ node: ${NODE_WASM} ${NODE_INDEX} ${NODE_PKG_FILES}
 
 
 #### node benchmark
-BENCHMARK_INPUT=${THIS_MAKEFILE_DIR}/../../app/benchmark/worldcitiespop_mil-sc.csv
+BENCHMARK_INPUT=${THIS_MAKEFILE_DIR}/../../app/benchmark/worldcitiespop_mil.csv
+
+NODE=node --experimental-wasm-modules
 
 benchmark: node count_compare select_compare
 
 count_compare:
 	@cp -p npm/test/count*.js node/
-	@cd node && (npm list | grep csv-parser) && echo "csv-parser already installed" || npm install csv-parser
+	@cd node && (npm list | grep csv-parser) && (npm list | grep papaparse) && echo "csv-parser and papaparse already installed" || npm install csv-parser papaparse
 
 	@echo "zsv count"
-	head -5000 ${BENCHMARK_INPUT} | node node/count.js 2>&1 | head -1
-	head -5000 ${BENCHMARK_INPUT} | node node/count.js 2>&1 | head -1
-	head -5000 ${BENCHMARK_INPUT} | node node/count.js 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/count.js 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/count.js 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/count.js 2>&1 | head -1
 
-	head -500000 ${BENCHMARK_INPUT} | node node/count.js 2>&1 | head -1
-	head -500000 ${BENCHMARK_INPUT} | node node/count.js 2>&1 | head -1
-	head -500000 ${BENCHMARK_INPUT} | node node/count.js 2>&1 | head -1
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/count.js 2>&1 | head -1
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/count.js 2>&1 | head -1
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/count.js 2>&1 | head -1
 
 
 	@echo "csv-parser count"
-	head -5000 ${BENCHMARK_INPUT} | node node/count-csv-parser.js 2>&1 | head -1
-	head -5000 ${BENCHMARK_INPUT} | node node/count-csv-parser.js 2>&1 | head -1
-	head -5000 ${BENCHMARK_INPUT} | node node/count-csv-parser.js 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/count-csv-parser.js 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/count-csv-parser.js 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/count-csv-parser.js 2>&1 | head -1
+
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/count-csv-parser.js 2>&1 | head -1
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/count-csv-parser.js 2>&1 | head -1
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/count-csv-parser.js 2>&1 | head -1
+
+	@echo "papaparse count"
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/count-papaparse.js 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/count-papaparse.js 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/count-papaparse.js 2>&1 | head -1
 
-	head -500000 ${BENCHMARK_INPUT} | node node/count-csv-parser.js 2>&1 | head -1
-	head -500000 ${BENCHMARK_INPUT} | node node/count-csv-parser.js 2>&1 | head -1
-	head -500000 ${BENCHMARK_INPUT} | node node/count-csv-parser.js 2>&1 | head -1
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/count-papaparse.js 2>&1 | head -1
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/count-papaparse.js 2>&1 | head -1
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/count-papaparse.js 2>&1 | head -1
 
 select_compare:
 	@cp -p npm/test/select_all*.js node/
-	@cd node && (npm list | grep csv-parser) && echo "csv-parser already installed" || npm install csv-parser
+	@cd node && (npm list | grep csv-parser) && (npm list | grep papaparse) && echo "csv-parser and papaparse already installed" || npm install csv-parser papaparse
 
 	@echo "zsv select_all"
-	head -5000 ${BENCHMARK_INPUT} | node node/select_all.js 2>&1 | head -1
-	head -5000 ${BENCHMARK_INPUT} | node node/select_all.js 2>&1 | head -1
-	head -5000 ${BENCHMARK_INPUT} | node node/select_all.js 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/select_all.js '' '[0,2]' 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/select_all.js '' '[0,2]' 2>&1 | head -1
 
-	head -500000 ${BENCHMARK_INPUT} | node node/select_all.js 2>&1 | head -1
-	head -500000 ${BENCHMARK_INPUT} | node node/select_all.js 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/select_all.js 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/select_all.js 2>&1 | head -1
 
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/select_all.js '' '[0,2]' 2>&1 | head -1
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/select_all.js '' '[0,2]' 2>&1 | head -1
+
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/select_all.js 2>&1 | head -1
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/select_all.js 2>&1 | head -1
 
 	@echo "csv-parser select_all"
-	head -5000 ${BENCHMARK_INPUT} | node node/select_all-csv-parser.js 2>&1 | head -1
-	head -5000 ${BENCHMARK_INPUT} | node node/select_all-csv-parser.js 2>&1 | head -1
-	head -5000 ${BENCHMARK_INPUT} | node node/select_all-csv-parser.js 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-csv-parser.js '' '[0,2]' 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-csv-parser.js '' '[0,2]' 2>&1 | head -1
+
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-csv-parser.js 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-csv-parser.js 2>&1 | head -1
+
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-csv-parser.js '' '[0,2]' 2>&1 | head -1
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-csv-parser.js '' '[0,2]' 2>&1 | head -1
+
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-csv-parser.js 2>&1 | head -1
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-csv-parser.js 2>&1 | head -1
+
+	@echo "papaparse select_all"
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-papaparse.js '' '[0,2]' 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-papaparse.js '' '[0,2]' 2>&1 | head -1
+
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-papaparse.js 2>&1 | head -1
+	head -5000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-papaparse.js 2>&1 | head -1
+
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-papaparse.js '' '[0,2]' 2>&1 | head -1
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-papaparse.js '' '[0,2]' 2>&1 | head -1
 
-	head -500000 ${BENCHMARK_INPUT} | node node/select_all-csv-parser.js 2>&1 | head -1
-	head -500000 ${BENCHMARK_INPUT} | node node/select_all-csv-parser.js 2>&1 | head -1
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-papaparse.js 2>&1 | head -1
+	head -500000 ${BENCHMARK_INPUT} | ${NODE} node/select_all-papaparse.js 2>&1 | head -1
diff --git a/examples/js/README.md b/examples/js/README.md
index 6d6ab16d..933aab0c 100644
--- a/examples/js/README.md
+++ b/examples/js/README.md
@@ -44,17 +44,18 @@ this example does not require that libzsv is already installed
 
 ## Performance
 
-Running ZSV lib from Javascript is still experimental and is not yet fully optimized. Some performance challenges are
-unique to web assembly + Javascript, especially where a lot of string data
+Running ZSV lib from Javascript is still experimental and is not yet fully optimized.
+Some performance challenges are particular to web assembly + Javascript, e.g. where a lot of string data
 is being passed between Javascript and the library (see e.g. https://hacks.mozilla.org/2019/08/webassembly-interface-types/).
 
-Furthermore, it is unlikely that zsv-lib can approach its full performance potential
-until emscripten (or gcc) [can provide a SIMD-powered movemask function](https://github.com/WebAssembly/simd/pull/201). Until then, libzsv in emscripten resorts to the "slow"
-movemask, which does have a significant impact.
+However, initial results are promising:
 
-Current testing suggests that on small files (under 1 MB), zsv-lib is 30-75% faster than, for example, the `csv-parser` library. However, on larger files,
-due to the aforementioned Javascript/wasm memory overhead and lack of
-SIMD movemask, it can be more than 50% slower than `csv-parser`.
+* Running only "count", zsv-lib is ~90%+ faster than `csv-parser` and `papaparse`
+* The more cell data that is fetched, the more this advantage diminishes due to the aforementioned Javascript/wasm memory overhead.
+  Our benchmarking suggests that if the entire row's data is fetched, performance is about on par with both csv-parser and papaparse.
+  If only a portion is fetched, performance is about the same as papaparse, and faster than csv-parser (how much faster
+  being roughly proportional to the difference between count (~90% faster) and the
+  amount of total data fetched)
 
 ## All the build commands
 
@@ -68,6 +69,11 @@ make clean
 
 Add MINIFY=1 to any of the above to generate minified code
 
+To run benchmark tests:
+```
+make benchmark
+```
+
 To see all make options:
 ```
 make
diff --git a/examples/js/js/foot.js b/examples/js/js/foot.js
index ca0b4143..cda5d57a 100644
--- a/examples/js/js/foot.js
+++ b/examples/js/js/foot.js
@@ -121,29 +121,37 @@
     let row = [];
 
     // convert bytes to JS data
-    for(let i = 0; i < count; i++)
-      row.push(z.getCell(i));
+    if(z.outputIndexes) {
+      for(let i = 0; i < z.outputIndexes.length; i++)
+        row.push(z.getCell(z.outputIndexes[i]));
+    } else {
+      for(let i = 0; i < count; i++)
+        row.push(z.getCell(i));
+    }
     z.rowHandler(row, z.ctx, z);
   }
 
   function globalReadFunc(buff, n, m, ix) {
     let z = activeParsers[ix];
     let sz = n * m;
-    let jsbuff = new Uint8Array(Module.HEAP8.buffer, buff, sz);
+    if(sz != z.last_sz || buff != z.last_buff || !z.jsbuff || z.jsbuff.buffer !== Module.HEAP8.buffer) { // reuse the view only if it still covers [buff, buff+sz) of the current heap
+      z.jsbuff = new Uint8Array(Module.HEAP8.buffer, buff, sz); // byte view over wasm memory that readSync fills in place
+      z.last_sz = sz; z.last_buff = buff; // remember what the cached view covers (memory growth replaces HEAP8.buffer)
+    }
     let bytes;
     try {
-      bytes = fs.readSync(z.fd, jsbuff, 0, sz);
+      bytes = fs.readSync(z.fd, z.jsbuff, 0, sz);
     } catch(e) {
       if(e.code == 'EAGAIN') {
         for(let i = 0; i < 100; i++)
           try {
-            bytes = fs.readSync(z.fd, jsbuff, 0, sz);
+            bytes = fs.readSync(z.fd, z.jsbuff, 0, sz);
             break;
           } catch(e1) {
           }
       }
       if(!bytes) {
-        console.error('Use buffered async read to fix this!', e.toString());
+        console.error('EAGAIN error: for stdin on linux/mac, node does not support sync-- use async instead', e.toString());
         throw new Error(e)
       }
     }
@@ -163,14 +171,16 @@
 
   return {
     /**
-     * create a new parser
+     * create a push parser
      *
      * @param rowHandler callback with signature (row, ctx, parser)
      * @param ctx        a caller-defined value that will be passed to the row handler
      * @param options
-     *        - rowData  if false, row data will not be passed to the row handler
-     *        - async    readableStream handle
-     *        - end      function(ctx, parser) to attach to stream end event
+     *        - rowData       if false, row data will not be passed to the row handler
+     *        - sync          synchronous readableStream handle
+     *        - async         async readableStream handle
+     *        - end           function(ctx, parser) to attach to stream end event
+     *        - outputIndexes array of 0-based indexes to output
      */
     new: function(rowHandler, ctx, options) {
       let zsv = _zsv_new(null);
@@ -191,6 +201,7 @@
         z = {
           zsv: zsv,
           rowHandler: rowHandler,
+          outputIndexes: options.outputIndexes,
           cellCount: cellCount,
           getCell: getCell,
           buff: null,
@@ -281,6 +292,10 @@
         activeParser_count++;
         _zsv_set_row_handler(zsv, options.rowData === false ? globalRowHandlerNoDatap : globalRowHandlerWithDatap);
         _zsv_set_context(zsv, z.ix);
+
+        if(options.sync)
+          while(_zsv_parse_more(zsv) == 0) // _zsv_status_ok
+            ;
         return o;
       }
     },
diff --git a/examples/js/npm/test/count-papaparse.js b/examples/js/npm/test/count-papaparse.js
new file mode 100644
index 00000000..073745d4
--- /dev/null
+++ b/examples/js/npm/test/count-papaparse.js
@@ -0,0 +1,58 @@
+const process= require('node:process');
+const { PerformanceObserver, performance } = require('node:perf_hooks');
+const fs = require('fs');
+const papa = require('papaparse');
+
+/**
+ * Benchmark counterpart to the zsv count example: uses papaparse to parse CSV input and count rows via a step callback
+ */
+
+/**
+ * We will use a separate context for each parser, which is a pattern that allows us to run multiple
+ * parsers at the same time independently, although this example only runs one at a time
+ */
+function createContext() {
+  return {
+    rowcount: 0,                  // how many rows we've parsed so far
+    startTime: performance.now(), // when the run was started
+    data: [],                     // object to hold all data parsed thus far
+    bytesRead: 0                  // how many bytes we've parsed thus far
+  };
+}
+
+/**
+ * Define the steps to take after all parsing has completed
+ */
+function finish(ctx) {
+  let endTime = performance.now()   /* check the time */
+
+  /* output a message describing the parse volume and performance */
+  console.error('Parsed ' + ctx.rowcount +
+                ' rows in ' + (endTime - ctx.startTime) + 'ms\n' +
+                'You can view the parsed data in your browser dev tools console (rt-click and select Inspect)');
+  
+  /**
+   * output the parsed data (we could have also done this while we parsed, and not
+   * bothered to accumulate it, to save memory)
+   */
+  console.log(ctx.data);
+}
+
+
+let ctx = createContext();
+
+let opts = {};
+
+opts.step = function(results, parser) {
+  ctx.rowcount++;
+  results.data = [];
+  return results;
+};
+opts.complete = function(results) {
+  finish(ctx);
+}
+
+
+/* read stdin if we have no arguments, else the first argument */
+const readStream = process.argv.length < 3 || !process.argv[2] ? process.stdin : fs.createReadStream(process.argv[2])
+papa.parse(readStream, opts);
diff --git a/examples/js/npm/test/select_all-csv-parser.js b/examples/js/npm/test/select_all-csv-parser.js
index 7a14af55..c09cfcb2 100644
--- a/examples/js/npm/test/select_all-csv-parser.js
+++ b/examples/js/npm/test/select_all-csv-parser.js
@@ -40,11 +40,18 @@ function finish(ctx) {
 
 
 let ctx = createContext();
+
+let opts = {};
+if(process.argv.length > 3 && process.argv[3]) {
+  let indexes = JSON.parse(process.argv[3]);
+  opts.mapHeaders = ({ header, index }) => (indexes.indexOf(index) > -1 ? header : null);
+}
+
 /* read stdin if we have no arguments, else the first argument */
-const readStream = process.argv.length < 3 ? process.stdin : fs.createReadStream(process.argv[2])
+const readStream = process.argv.length < 3 || !process.argv[2] ? process.stdin : fs.createReadStream(process.argv[2])
 readStream.on('error', (error) => console.log(error.message));
 readStream
-  .pipe(csv())
+  .pipe(csv(opts))
   .on('data', (row) => {
     ctx.rowcount++;
     ctx.data.push(row);
diff --git a/examples/js/npm/test/select_all-papaparse.js b/examples/js/npm/test/select_all-papaparse.js
new file mode 100644
index 00000000..b9397848
--- /dev/null
+++ b/examples/js/npm/test/select_all-papaparse.js
@@ -0,0 +1,67 @@
+const process= require('node:process');
+const { PerformanceObserver, performance } = require('node:perf_hooks');
+const fs = require('fs');
+const papa = require('papaparse');
+
+/**
+ * Benchmark counterpart to the zsv select_all example: uses papaparse to parse CSV input and collect every row, or only the column indexes given as a JSON array in the second argument
+ */
+
+/**
+ * We will use a separate context for each parser, which is a pattern that allows us to run multiple
+ * parsers at the same time independently, although this example only runs one at a time
+ */
+function createContext() {
+  return {
+    rowcount: 0,                  // how many rows we've parsed so far
+    startTime: performance.now(), // when the run was started
+    data: [],                     // object to hold all data parsed thus far
+    bytesRead: 0                  // how many bytes we've parsed thus far
+  };
+}
+
+/**
+ * Define the steps to take after all parsing has completed
+ */
+function finish(ctx) {
+  let endTime = performance.now()   /* check the time */
+
+  /* output a message describing the parse volume and performance */
+  console.error('Parsed ' + ctx.rowcount +
+                ' rows in ' + (endTime - ctx.startTime) + 'ms\n' +
+                'You can view the parsed data in your browser dev tools console (rt-click and select Inspect)');
+  
+  /**
+   * output the parsed data (we could have also done this while we parsed, and not
+   * bothered to accumulate it, to save memory)
+   */
+  console.log(ctx.data);
+}
+
+
+let ctx = createContext();
+
+let opts = {};
+
+if(process.argv.length > 3 && process.argv[3]) {
+  let indexes = JSON.parse(process.argv[3]);
+  opts.step = function(results, parser) {
+    ctx.rowcount++;
+    ctx.data.push(indexes.map(ix => results.data[ix]));
+    return results;
+  };
+  opts.complete = function(results) {
+    finish(ctx);
+  }
+} else {
+  opts.complete = function(results) {
+    ctx.rowcount = results.data.length;
+    ctx.data = results.data;
+    finish(ctx);
+  }
+}
+
+
+/* read stdin if we have no arguments, else the first argument */
+const readStream = process.argv.length < 3 || !process.argv[2] ? process.stdin : fs.createReadStream(process.argv[2])
+papa.parse(readStream, opts);
diff --git a/examples/js/npm/test/select_all.js b/examples/js/npm/test/select_all.js
index 9c02ffd8..469ce3ce 100644
--- a/examples/js/npm/test/select_all.js
+++ b/examples/js/npm/test/select_all.js
@@ -57,8 +57,12 @@ zsvParser.runOnLoad(function() {
   let ctx = createContext();
 
   /* read stdin if we have no arguments, else the first argument */
-  const readFile = process.argv.length < 3 ? process.stdin : fs.createReadStream(process.argv[2]);
+  const readFile = process.argv.length < 3 || !process.argv[2] ? process.stdin : fs.createReadStream(process.argv[2]);
+
+  let outputIndexes;
+  if(process.argv.length > 3 && process.argv[3])
+    outputIndexes = JSON.parse(process.argv[3]);
 
   /* initialize parser */
-  let parser = zsvParser.new(rowHandler, ctx, { async: readFile, end: finish });
+  let parser = zsvParser.new(rowHandler, ctx, { async: readFile, end: finish, outputIndexes: outputIndexes });
 });
diff --git a/examples/lib/README.md b/examples/lib/README.md
index 8a1e0af9..19349cab 100644
--- a/examples/lib/README.md
+++ b/examples/lib/README.md
@@ -4,8 +4,6 @@
 
 This directory contains the following examples:
 
-
-
 | file     | description |
 | -- | -- |
 | [pull.c](pull.c) | Same as simple.c, but uses pull parsing via `zsv_pull_next_row()`|
diff --git a/src/Makefile b/src/Makefile
index 7b884176..cc58d902 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -41,7 +41,7 @@ ifneq ($(findstring emcc,$(CC)),) # emcc
   ZSV_EXTRAS=1
   NO_THREADING=1
 else # not emcc
-  CFLAGS+= ${CFLAGS_AVX}
+  CFLAGS+= ${CFLAGS_AVX} ${CFLAGS_SSE}
 endif
 
 ifeq ($(NO_THREADING),1)
diff --git a/src/zsv_internal.c b/src/zsv_internal.c
index 9279bb03..5856e230 100644
--- a/src/zsv_internal.c
+++ b/src/zsv_internal.c
@@ -372,7 +372,12 @@ static inline enum zsv_status cell_and_row_dl(struct zsv_scanner *scanner, unsig
   only for each corresponding non-zero highest-bit value in the vector)
 */
 
-# if defined(__ARM_NEON) || defined(__ARM_NEON__)
+# ifdef __EMSCRIPTEN__
+
+#include <wasm_simd128.h>
+#define movemask_pseudo(x) wasm_i8x16_bitmask(x)
+
+# elif defined(__ARM_NEON) || defined(__ARM_NEON__)
 #  include <arm_neon.h>
 static inline zsv_mask_t movemask_pseudo(zsv_uc_vector v) {
   // see https://stackoverflow.com/questions/11870910/
@@ -389,17 +394,27 @@ static inline zsv_mask_t movemask_pseudo(zsv_uc_vector v) {
   vst1q_lane_u8((uint8_t*)&mask + 1, (uint8x16_t)imask, 8);
   return mask;
 }
+
+# elif defined(__SSE2__)
+
+typedef char zsv_c_vector __attribute__ ((vector_size (VECTOR_BYTES)));
+#  define movemask_pseudo(x) __builtin_ia32_pmovmskb128((zsv_c_vector)(x)) /* SSE2 pmovmskb: bit i of the result is the high bit of byte i */
+
 # else
+
+// slow path
+
 static inline zsv_mask_t movemask_pseudo(zsv_uc_vector v) {
-  // to do: see https://github.com/WebAssembly/simd/issues/131 for wasm
   zsv_mask_t mask = 0, tmp = 1;
   for(size_t i = 0; i < sizeof(zsv_uc_vector); i++) {
-    mask += (v[i] ? tmp : 0);
+    mask |= (v[i] ? tmp : 0);
     tmp <<= 1;
   }
+
   return mask;
 }
-# endif // __ARM_NEON
+
+# endif // __EMSCRIPTEN__
 #endif // ndef movemask_pseudo
 
 # include "vector_delim.c"