From 421f84a113e15bb161b99435c101219348b2449a Mon Sep 17 00:00:00 2001 From: liquidaty Date: Fri, 19 Aug 2022 15:08:24 -0700 Subject: [PATCH] =?UTF-8?q?initial=20commit:=20only=20applies=20to=20selec?= =?UTF-8?q?t=20cmd=20(to=20do:=20test,=20apply=20to=20all=E2=80=A6=20(#50)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * move header offset/depth generic options (-R,--skip-head / -d,--header-row-span) into parser and make available for all commands, instead of only for select command * other miscellaneous cleanup --- app/Makefile | 6 +- app/benchmark/Makefile | 4 +- app/builtin/help.c | 2 + .../test/expected/zsvext-test-3.out | 4 + app/jq.c | 9 +- app/jq_internal.c | 27 +-- app/jq_internal.h | 4 +- app/select.c | 63 ++----- app/test/Makefile | 46 ++--- app/utils/arg.c | 56 ++++-- app/utils/string.c | 2 +- include/zsv/common.h | 14 +- src/zsv.c | 11 +- src/zsv_internal.c | 173 +++++++++++++++++- 14 files changed, 283 insertions(+), 138 deletions(-) diff --git a/app/Makefile b/app/Makefile index e73cd652..ba463cea 100644 --- a/app/Makefile +++ b/app/Makefile @@ -141,8 +141,8 @@ endif ZSV=$(BINDIR)/zsv${EXE} -SOURCES= echo count select 2json serialize flatten pretty stack desc 2tsv sql 2db -CLI_SOURCES=select desc count pretty sql flatten 2json 2tsv serialize stack 2db +SOURCES= echo count select 2tsv 2json serialize flatten pretty stack desc sql 2db +CLI_SOURCES=select desc count 2tsv pretty sql flatten 2json serialize stack 2db ifneq ($(LDFLAGS_JQ),) SOURCES+= jq @@ -210,7 +210,7 @@ help: @echo "which will build and test all apps, or to build/test a single app:" @echo " ${MAKE} test-xx" @echo "where xx is any of:" - @echo " echo count select 2json serialize flatten pretty stack desc 2tsv sql 2db" + @echo " echo count select 2tsv 2json serialize flatten pretty stack desc sql 2db" @echo "" install: ${ZSV} diff --git a/app/benchmark/Makefile b/app/benchmark/Makefile index 2e3d8c89..6f308356 100644 --- 
a/app/benchmark/Makefile +++ b/app/benchmark/Makefile @@ -28,11 +28,11 @@ endif CCBN=$(shell basename ${CC}) ZSVBIN=../../build/${BUILD_SUBDIR}/${CCBN}/bin/zsv_ -QUICK= +QUICK=1 help: @echo "To run all tests (set QUICK to skip mlr and csvcut):" - @echo " make all [QUICK=1]" + @echo " make all [QUICK=0]" @echo " make CLI" CLI: ZSVBIN="zsv " diff --git a/app/builtin/help.c b/app/builtin/help.c index f6bcce3a..e99405ac 100644 --- a/app/builtin/help.c +++ b/app/builtin/help.c @@ -43,6 +43,8 @@ static int main_help(int argc, const char *argv[]) { " -t,--tab-delim: set column delimiter to tab", " -O,--other-delim : set column delimiter to specified character", " -q,--no-quote: turn off quote handling", + " -R,--skip-head : skip specified number of initial rows", + " -d,--header-row-span : apply header depth (rowspan) of n", " -v,--verbose: verbose output", "", "Commands:", diff --git a/app/ext_example/test/expected/zsvext-test-3.out b/app/ext_example/test/expected/zsvext-test-3.out index a3e075d3..2825def1 100644 --- a/app/ext_example/test/expected/zsvext-test-3.out +++ b/app/ext_example/test/expected/zsvext-test-3.out @@ -25,6 +25,8 @@ Options common to all commands: -t,--tab-delim: set column delimiter to tab -O,--other-delim : set column delimiter to specified character -q,--no-quote: turn off quote handling + -R,--skip-head : skip specified number of initial rows + -d,--header-row-span : apply header depth (rowspan) of n -v,--verbose: verbose output Commands: @@ -68,6 +70,8 @@ Options common to all commands: -t,--tab-delim: set column delimiter to tab -O,--other-delim : set column delimiter to specified character -q,--no-quote: turn off quote handling + -R,--skip-head : skip specified number of initial rows + -d,--header-row-span : apply header depth (rowspan) of n -v,--verbose: verbose output Commands: diff --git a/app/jq.c b/app/jq.c index 50cee9cb..869f9277 100644 --- a/app/jq.c +++ b/app/jq.c @@ -2,11 +2,6 @@ #include #include "jq_internal.h" - -size_t 
fwrite1(void *restrict FILE_ptr, const void *restrict buff, size_t len) { - return fwrite(buff, len, 1, FILE_ptr); -} - #include "jq_internal.c" #ifndef APPNAME @@ -82,11 +77,11 @@ int MAIN(int argc, const char *argv[]) { if(!err) { void (*jqfunc)(jv, void *) = to_csv ? jv_to_csv : jv_to_json_func; struct jv_to_json_ctx ctx; - ctx.write1 = fwrite1; + ctx.write1 = zsv_jq_fwrite1; ctx.ctx = f_out; ctx.flags = JV_PRINT_PRETTY | JV_PRINT_SPACE1; - void *jqctx = to_csv ? f_out : &ctx; + void *jqctx = to_csv ? (void *)f_out : (void *)&ctx; enum zsv_jq_status jqstat; zsv_jq_handle zjq = zsv_jq_new(jqfilter, jqfunc, jqctx, &jqstat); if(jqstat != zsv_jq_status_ok) { diff --git a/app/jq_internal.c b/app/jq_internal.c index fccc19b6..6602dbe8 100644 --- a/app/jq_internal.c +++ b/app/jq_internal.c @@ -118,21 +118,13 @@ static void jv_to_csv_aux(jv value, FILE *f, int inside_string) { jv_free(value); } -/* -void jv_to_json_w_ctx(jv value, void *ctx) { - struct jv_to_json_ctx *c = ctx; - jv_dumpf(value, ctx->out, c->flags); +size_t zsv_jq_fwrite1(void *restrict FILE_ptr, const void *restrict buff, size_t len) { + return fwrite(buff, len, 1, FILE_ptr); } -void jv_to_json(jv value, void *file) { - FILE *f = file; - jv_dumpf(value, f, 0); -} -*/ - void jv_to_json_func(jv value, void *ctx) { struct jv_to_json_ctx *data = ctx; - if(data->write1 == fwrite1) + if(data->write1 == zsv_jq_fwrite1) jv_dumpf(value, data->ctx, data->flags); else { // jv_dump_string is memory-inefficient @@ -184,7 +176,6 @@ void jv_to_csv(jv value, void *file) { jv_free(value); } -/// static void jv_to_txt_aux(jv value, FILE *f) { f = f ? 
f : stdout; if(!jv_print_scalar(jv_copy(value), 0, f, 0)) { @@ -262,8 +253,6 @@ void jv_to_lqjq(jv value, void *h) { jv_free(jv_s); } -/// - struct zsv_jq_data { void *jq; struct jv_parser *parser; @@ -398,7 +387,6 @@ static int zsv_jq_process(jq_state *jq, return ret; } - void jv_to_bool(jv value, void *char_result) { char *c = char_result; switch(jv_get_kind(value)) { @@ -416,12 +404,3 @@ void jv_to_bool(jv value, void *char_result) { } jv_free(value); } - -static const unsigned char *strrchru(const unsigned char *s, char c) { - return (const unsigned char *)strrchr((const char *)s, c); -} - -static size_t zsv_jq_parse1(void *restrict h, const void *restrict s, size_t len) { - enum zsv_jq_status stat = zsv_jq_parse(h, s, len); - return (size_t) stat; -} diff --git a/app/jq_internal.h b/app/jq_internal.h index d71e82b7..99aa117c 100644 --- a/app/jq_internal.h +++ b/app/jq_internal.h @@ -11,8 +11,10 @@ enum zsv_jq_status { zsv_jq_status_error }; +size_t zsv_jq_fwrite1(void *restrict FILE_ptr, const void *restrict buff, size_t len); + struct jv_to_json_ctx { - size_t (*write1)(void *restrict ctx, const void *restrict buff, size_t len); // e.g. common/write1 + size_t (*write1)(void *restrict ctx, const void *restrict buff, size_t len); // e.g. zsv_jq_fwrite1 void *ctx; // e.g. 
FILE * int flags; // passed on to jv_dumpf / jv_dump_string }; diff --git a/app/select.c b/app/select.c index 799b3f14..11f46e39 100644 --- a/app/select.c +++ b/app/select.c @@ -83,7 +83,6 @@ struct zsv_select_data { unsigned int output_col_index; // num of cols printed in current row size_t file_row_count; - size_t header_rows_processed; // output columns: const char **col_argv; @@ -112,10 +111,8 @@ struct zsv_select_data { double sample_pct; - unsigned char skip_rows; - unsigned char skip_rows_orig; unsigned char sample_every_n; - unsigned char header_depth; + size_t data_rows_limit; size_t skip_data_rows; @@ -522,34 +519,27 @@ static void zsv_select_header_row(void *ctx) { if(data->cancelled) return; - if(data->skip_rows > 0) - data->skip_rows--; - else { - data->header_rows_processed++; - unsigned int cols = zsv_column_count(data->parser); - unsigned int max_header_ix = 0; - for(unsigned int i = 0; i < cols; i++) { - struct zsv_cell cell = zsv_get_cell(data->parser, i); - cell.str = zsv_select_cell_clean(data, cell.str, cell.quoted, &cell.len); - if(i < data->opts.max_columns) { - zsv_select_append_spaced_word(&data->header_names[i], cell.str, cell.len); - if(cell.len) - max_header_ix = i+1; - } + unsigned int cols = zsv_column_count(data->parser); + unsigned int max_header_ix = 0; + for(unsigned int i = 0; i < cols; i++) { + struct zsv_cell cell = zsv_get_cell(data->parser, i); + cell.str = zsv_select_cell_clean(data, cell.str, cell.quoted, &cell.len); + if(i < data->opts.max_columns) { + zsv_select_append_spaced_word(&data->header_names[i], cell.str, cell.len); + if(cell.len) + max_header_ix = i+1; } + } - // in case we want to make this an option later - char trim_trailing_columns = 1; - if(!trim_trailing_columns) - max_header_ix = cols; + // in case we want to make this an option later + char trim_trailing_columns = 1; + if(!trim_trailing_columns) + max_header_ix = cols; - if(max_header_ix > data->header_name_count) - data->header_name_count = 
max_header_ix; + if(max_header_ix > data->header_name_count) + data->header_name_count = max_header_ix; - // if this was the last row in the header, finish header processing - if(data->header_rows_processed >= data->header_depth) - zsv_select_header_finish(data); - } + zsv_select_header_finish(data); } #define ZSV_SELECT_MAX_COLS_DEFAULT 1024 @@ -586,8 +576,6 @@ const char *zsv_select_usage_msg[] = " --distinct: skip subsequent occurrences of columns with the same name", " --merge: merge subsequent occurrences of columns with the same name, outputting first non-null value", // --rename: like distinct, but instead of removing cols with dupe names, renames them, trying _ for n up to max cols - " -R, --skip-head : skip specified number of rows", - " -D, --skip-data : skip the specified number of data rows", " -e : char to replace embedded lineend. if none provided, embedded lineends are preserved", " If the provided string begins with 0x, it will be interpreted as the hex representation of a string", " -x : exclude the indicated column. 
can be specified more than once", @@ -658,8 +646,6 @@ int MAIN(int argc, const char *argv[]) { data.opts = zsv_get_default_opts(); struct zsv_csv_writer_options writer_opts = zsv_writer_get_default_opts(); - data.header_depth = 1; - int col_index_arg_i = 0; const char *insert_header_row = NULL; for(int arg_i = 1; !err && arg_i < argc; arg_i++) { @@ -733,13 +719,8 @@ int MAIN(int argc, const char *argv[]) { else if(!strcmp(argv[arg_i], "--whitespace-clean-no-newline")) { data.clean_white = 1; data.whitspace_clean_flags = 1; - } else if(!strcmp(argv[arg_i], "-W") || !strcmp(argv[arg_i], "--no-trim")) + } else if(!strcmp(argv[arg_i], "-W") || !strcmp(argv[arg_i], "--no-trim")) { data.no_trim_whitespace = 1; - else if(!strcmp(argv[arg_i], "-d") || !strcmp(argv[arg_i], "--header-row-span")) { - if(!(arg_i + 1 < argc && atoi(argv[arg_i+1]) >= 0 && atoi(argv[arg_i+1]) < 256)) - err = zsv_printerr(1, "%s option value invalid: should be integer between 1 and 255; got %s", argv[arg_i], arg_i + 1 < argc ? argv[arg_i+1] : ""); - else - data.header_depth = (unsigned char)atoi(argv[++arg_i]); } else if(!strcmp(argv[arg_i], "--header-row")) { arg_i++; if(!(arg_i < argc)) @@ -768,12 +749,6 @@ int MAIN(int argc, const char *argv[]) { err = zsv_printerr(1, "%s option value invalid: should be positive integer; got %s", argv[arg_i], arg_i + 1 < argc ? 
argv[arg_i+1] : ""); else data.data_rows_limit = atoi(argv[++arg_i]) + 1; - } else if(!strcmp(argv[arg_i], "-R") || !strcmp(argv[arg_i], "--skip-head")) { - ++arg_i; - if(!(arg_i < argc && atoi(argv[arg_i]) >= 0 && atoi(argv[arg_i]) < 256)) - err = zsv_printerr(1, "-R option value invalid: should be positive integer smaller than 256"); - else - data.skip_rows = data.skip_rows_orig = atoi(argv[arg_i]); } else if(!strcmp(argv[arg_i], "-D") || !strcmp(argv[arg_i], "--skip-data")) { ++arg_i; if(!(arg_i < argc && atoi(argv[arg_i]) >= 0)) diff --git a/app/test/Makefile b/app/test/Makefile index 7f482f21..6afe1aa6 100644 --- a/app/test/Makefile +++ b/app/test/Makefile @@ -62,7 +62,7 @@ COLOR_PINK=\033[1;35m TEST_PASS=echo "${COLOR_BLUE}$@: ${COLOR_GREEN}Passed${COLOR_NONE}" TEST_FAIL=(echo "${COLOR_BLUE}$@: ${COLOR_RED}Failed!${COLOR_NONE}" && exit 1) -TEST_NAME=echo "${COLOR_PINK}$@: ${COLOR_NONE}" +TEST_INIT=mkdir -p ${TMP_DIR} && echo "${COLOR_PINK}$@: ${COLOR_NONE}" ARGS-sql='select [Loan Number] from data' @@ -95,17 +95,17 @@ CLI: @echo "Testing CLI..." 
@make CLI1=1 test -n | sed 's/\/[^ ]*\/bin\/zsv_/zsv /g' | sh -test: ${TMP_DIR} ${TESTS} +test: ${TESTS} #${TMP_DIR} ${TESTS} -${TMP_DIR}: - @mkdir -p ${TMP_DIR} +# ${TMP_DIR}: +# @mkdir -p ${TMP_DIR} .SECONDARY: worldcitiespop_mil.csv .PHONY: help test test-% test-stack clean test-echo : ${BUILD_DIR}/bin/zsv_echo${EXE} - @${TEST_NAME} + @${TEST_INIT} @${PREFIX} $< ${TEST_DATA_DIR}/loans_1.csv ${REDIRECT} ${TMP_DIR}/$@.out @${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL} @@ -115,28 +115,28 @@ worldcitiespop_mil.csv: test-count: test-count-1 test-count-2 test-count-1: ${BUILD_DIR}/bin/zsv_count${EXE} worldcitiespop_mil.csv - @${TEST_NAME} + @${TEST_INIT} @cat worldcitiespop_mil.csv | ${PREFIX} $< ${REDIRECT} ${TMP_DIR}/$@.out @${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL} test-count-2: ${BUILD_DIR}/bin/zsv_count${EXE} ${TEST_DATA_DIR}/test/buffsplit_quote.csv - @${TEST_NAME} + @${TEST_INIT} @for x in 5000 5002 5004 5006 5008 5010 5013 5015 5017 5019 5021 5101 5105 5111 5113 5115 5117 5119 5121 5123 5125 5127 5129 5131 5211 5213 5215 5217 5311 5313 5315 5317 5413 5431 5433 5455 6133 ; do $< -r $$x ${TEST_DATA_DIR}/test/buffsplit_quote.csv ; done > ${TMP_DIR}/$@.out @${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL} test-select: test-select-n test-select-6 test-select-7 test-select-8 test-select-9 test-select-quotebuff test-select-fixed-1 test-select-merge test-select-merge: ${BUILD_DIR}/bin/zsv_select${EXE} - @${TEST_NAME} + @${TEST_INIT} @${PREFIX} $< --merge ${TEST_DATA_DIR}/test/select-merge.csv ${REDIRECT} ${TMP_DIR}/test-select-merge.out @${CMP} ${TMP_DIR}/test-select-merge.out expected/test-select-merge.out && ${TEST_PASS} || ${TEST_FAIL} test-select-quotebuff: ${BUILD_DIR}/bin/zsv_select${EXE} - @${TEST_NAME} + @${TEST_INIT} @${THIS_MAKEFILE_DIR}/select-quotebuff-gen.sh | ${PREFIX} $< -B 4096 | sed 's/"/Q/g' | grep QQ >/dev/null && ${TEST_FAIL} || ${TEST_PASS} test-select-n: 
${BUILD_DIR}/bin/zsv_select${EXE} - @${TEST_NAME} + @${TEST_INIT} @${PREFIX} $< ${TEST_DATA_DIR}/loans_1.csv -u "?" -R 4 -d 2 ${REDIRECT} ${TMP_DIR}/test-select.out @${CMP} ${TMP_DIR}/test-select.out expected/test-select.out && ${TEST_PASS} || ${TEST_FAIL} @${PREFIX} $< ${TEST_DATA_DIR}/test/embedded.csv -e 'X' ${REDIRECT} ${TMP_DIR}/test-select.2.out @@ -149,12 +149,12 @@ test-select-n: ${BUILD_DIR}/bin/zsv_select${EXE} @${CMP} ${TMP_DIR}/test-select.5.out expected/test-select.5.out && ${TEST_PASS} || ${TEST_FAIL} test-select-6: ${BUILD_DIR}/bin/zsv_select${EXE} - @${TEST_NAME} + @${TEST_INIT} @${PREFIX} $< ${TEST_DATA_DIR}/test/tab.txt -t ${REDIRECT} ${TMP_DIR}/$@.out @${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL} test-select-7: ${BUILD_DIR}/bin/zsv_select${EXE} - @${TEST_NAME} + @${TEST_INIT} @${PREFIX} $< ${TEST_DATA_DIR}/test/white.csv ${REDIRECT} ${TMP_DIR}/$@.out1 @${CMP} ${TMP_DIR}/$@.out1 expected/$@.out1 && ${TEST_PASS} || ${TEST_FAIL} @${PREFIX} $< --whitespace-clean ${TEST_DATA_DIR}/test/white.csv ${REDIRECT} ${TMP_DIR}/$@.out2 @@ -163,7 +163,7 @@ test-select-7: ${BUILD_DIR}/bin/zsv_select${EXE} @${CMP} ${TMP_DIR}/$@.out3 expected/$@.out3 && ${TEST_PASS} || ${TEST_FAIL} test-select-8: ${BUILD_DIR}/bin/zsv_select${EXE} - @${TEST_NAME} + @${TEST_INIT} @${PREFIX} $< ${TEST_DATA_DIR}/test/white_utf8.csv ${REDIRECT} ${TMP_DIR}/$@.out1 @${CMP} ${TMP_DIR}/$@.out1 expected/$@.out1 && ${TEST_PASS} || ${TEST_FAIL} @${PREFIX} $< --whitespace-clean ${TEST_DATA_DIR}/test/white_utf8.csv ${REDIRECT} ${TMP_DIR}/$@.out2 @@ -172,12 +172,12 @@ test-select-8: ${BUILD_DIR}/bin/zsv_select${EXE} @${CMP} ${TMP_DIR}/$@.out3 expected/$@.out3 && ${TEST_PASS} || ${TEST_FAIL} test-select-9: ${BUILD_DIR}/bin/zsv_select${EXE} - @${TEST_NAME} + @${TEST_INIT} @${PREFIX} $< ${TEST_DATA_DIR}/test/quoted3.csv -q ${REDIRECT} ${TMP_DIR}/$@.out @${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL} test-select-fixed-1: 
${BUILD_DIR}/bin/zsv_select${EXE} - @${TEST_NAME} + @${TEST_INIT} @${PREFIX} $< ${TEST_DATA_DIR}/fixed.csv --fixed 3,7,12,18,20,21,22 ${REDIRECT} ${TMP_DIR}/$@.out @${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL} @@ -185,17 +185,17 @@ test-select-fixed-1: ${BUILD_DIR}/bin/zsv_select${EXE} test-stack: test-stack1 test-stack2 test-stack1: ${BUILD_DIR}/bin/zsv_stack${EXE} - @${TEST_NAME} + @${TEST_INIT} @${PREFIX} $< ${TEST_DATA_DIR}/stack[12].csv ${REDIRECT} ${TMP_DIR}/$@.out @${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL} test-stack2: ${BUILD_DIR}/bin/zsv_stack${EXE} - @${TEST_NAME} + @${TEST_INIT} @${PREFIX} $< ${TEST_DATA_DIR}/stack2-[12].csv ${REDIRECT} ${TMP_DIR}/$@.out @${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL} test-2tsv test-sql test-serialize test-flatten test-pretty : test-%: ${BUILD_DIR}/bin/zsv_%${EXE} - @${TEST_NAME} + @${TEST_INIT} @( ( ! [ -s "${TEST_DATA_DIR}/test/$*.csv" ] ) && echo "No test input for $*") || \ (${PREFIX} $< ${ARGS-$*} < ${TEST_DATA_DIR}/test/$*.csv ${REDIRECT1} ${TMP_DIR}/$@.out && \ ${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL}) @@ -203,13 +203,13 @@ test-2tsv test-sql test-serialize test-flatten test-pretty : test-%: ${BUILD_DIR test-sql: test-sql2 test-sql3 test-sql2: ${BUILD_DIR}/bin/zsv_sql${EXE} - @${TEST_NAME} + @${TEST_INIT} @echo ${ARGS-sql} > ${TMP_DIR}/$@.sql @${PREFIX} $< '@'${TMP_DIR}/$@.sql ${TEST_DATA_DIR}/test/sql.csv ${REDIRECT1} ${TMP_DIR}/$@.out && \ ${CMP} ${TMP_DIR}/$@.out expected/test-sql.out && ${TEST_PASS} || ${TEST_FAIL} test-sql3: ${BUILD_DIR}/bin/zsv_sql${EXE} - @${TEST_NAME} + @${TEST_INIT} @${PREFIX} $< --join-indexes 8 ${TEST_DATA_DIR}/test/sql.csv ${TEST_DATA_DIR}/test/sql.csv ${REDIRECT1} ${TMP_DIR}/$@.out && \ ${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL} @@ -218,7 +218,7 @@ ${BUILD_DIR}/bin/zsv_%${EXE}: make -C .. 
$@ CONFIGFILE=${CONFIGFILEPATH} DEBUG=${DEBUG} test-2db: test-%: ${BUILD_DIR}/bin/zsv_%${EXE} worldcitiespop_mil.csv ${BUILD_DIR}/bin/zsv_2json${EXE} ${BUILD_DIR}/bin/zsv_select${EXE} - @${TEST_NAME} + @${TEST_INIT} @${BUILD_DIR}/bin/zsv_select${EXE} -L 25000 -N worldcitiespop_mil.csv | ${BUILD_DIR}/bin/zsv_2json${EXE} --database --index "country_ix on country" --unique-index "ux on [#]" > ${TMP_DIR}/$@.json @(${PREFIX} $< ${ARGS-$*} -o ${TMP_DIR}/$@.db --table data --overwrite < ${TMP_DIR}/test-2db.json ${REDIRECT1} ${TMP_DIR}/$@.out) @${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL} @@ -232,7 +232,7 @@ test-jq: test-%: ${BUILD_DIR}/bin/zsv_%${EXE} @${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL} test-2json: test-%: ${BUILD_DIR}/bin/zsv_%${EXE} ${BUILD_DIR}/bin/zsv_2db${EXE} ${BUILD_DIR}/bin/zsv_select${EXE} worldcitiespop_mil.csv - @${TEST_NAME} + @${TEST_INIT} @( ( ! [ -s "${TEST_DATA_DIR}/test/$*.csv" ] ) && echo "No test input for $*") || \ (${PREFIX} $< ${ARGS-$*} < ${TEST_DATA_DIR}/test/$*.csv ${REDIRECT1} ${TMP_DIR}/$@.out && \ ${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL}) @@ -262,7 +262,7 @@ test-2json: test-%: ${BUILD_DIR}/bin/zsv_%${EXE} ${BUILD_DIR}/bin/zsv_2db${EXE} test-desc: test-%: ${BUILD_DIR}/bin/zsv_%${EXE} - @${TEST_NAME} + @${TEST_INIT} @( ( ! 
[ -s "${TEST_DATA_DIR}/test/$*.csv" ] ) && echo "No test input for $*") || \ (${PREFIX} $< -q < ${TEST_DATA_DIR}/test/$*.csv ${REDIRECT1} ${TMP_DIR}/$@.out && \ ${CMP} ${TMP_DIR}/$@.out expected/$@.out && ${TEST_PASS} || ${TEST_FAIL}) diff --git a/app/utils/arg.c b/app/utils/arg.c index 8cdda83c..aacc4b15 100644 --- a/app/utils/arg.c +++ b/app/utils/arg.c @@ -85,9 +85,9 @@ int zsv_args_to_opts(int argc, const char *argv[], argv_out[new_argc] = argv[new_argc]; #ifdef ZSV_EXTRAS - static const char *short_args = "BcrtOqvL"; + static const char *short_args = "BcrtOqvRdL"; #else - static const char *short_args = "BcrtOqv"; + static const char *short_args = "BcrtOqvRd"; #endif static const char *long_args[] = { @@ -98,6 +98,8 @@ int zsv_args_to_opts(int argc, const char *argv[], "other-delim", "no-quote", "verbose", + "skip-head", + "header-row-span", #ifdef ZSV_EXTRAS "limit-rows", #endif @@ -132,6 +134,8 @@ int zsv_args_to_opts(int argc, const char *argv[], case 'c': case 'r': case 'O': + case 'R': + case 'd': if(++i >= argc) err = fprintf(stderr, "Error: option %s requires a value\n", argv[i-1]); else if(arg == 'O') { @@ -144,30 +148,46 @@ int zsv_args_to_opts(int argc, const char *argv[], opts_out->delimiter = *val; } else { const char *val = argv[i]; - /* arg = 'B', 'c', 'r' or 'L' (ZSV_EXTRAS only) */ + /* arg = 'B', 'c', 'r', 'R', 'd', or 'L' (ZSV_EXTRAS only) */ long n = atol(val); + if(n < 0) + err = fprintf(stderr, "Error: option %s value may not be less than zero (got %li)\n", argv[i-1], n); /* argv[i-1] is the option name; val is its value */ #ifdef ZSV_EXTRAS - if(arg == 'L') { + else if(arg == 'L') { if(n < 1) err = fprintf(stderr, "Error: max rows may not be less than 1 (got %s)\n", val); else opts_out->max_rows = n; } else #endif - if(arg == 'B' && n < ZSV_MIN_SCANNER_BUFFSIZE) - err = fprintf(stderr, "Error: buff size may not be less than %u (got %s)\n", - ZSV_MIN_SCANNER_BUFFSIZE, val); - else if(arg == 'c' && n < 8) - err = fprintf(stderr, "Error: max column count may not be less than 8 (got %s)\n", val); -
else if(arg == 'r' && n < ZSV_ROW_MAX_SIZE_MIN) - err = fprintf(stderr, "Error: max row size size may not be less than %u (got %s)\n", - ZSV_ROW_MAX_SIZE_MIN, val); - else if(arg == 'B') - opts_out->buffsize = n; - else if(arg == 'c') - opts_out->max_columns = n; - else if(arg == 'r') - opts_out->max_row_size = n; + if(arg == 'B') { + if(n < ZSV_MIN_SCANNER_BUFFSIZE) + err = fprintf(stderr, "Error: buff size may not be less than %u (got %s)\n", + ZSV_MIN_SCANNER_BUFFSIZE, val); + else + opts_out->buffsize = n; + } else if(arg == 'c') { + if(n < 8) + err = fprintf(stderr, "Error: max column count may not be less than 8 (got %s)\n", val); + else + opts_out->max_columns = n; + } else if(arg == 'r') { + if(n < ZSV_ROW_MAX_SIZE_MIN) + err = fprintf(stderr, "Error: max row size size may not be less than %u (got %s)\n", + ZSV_ROW_MAX_SIZE_MIN, val); + else + opts_out->max_row_size = n; + } else if(arg == 'd') { + if(n < 8 && n >= 0) + opts_out->header_span = n; + else + err = fprintf(stderr, "Error: header_span must be an integer between 0 and 7\n"); /* message matches the n < 8 check above */ + } else if(arg == 'R') { + if(n >= 0) + opts_out->rows_to_skip = n; + else + err = fprintf(stderr, "Error: rows_to_skip must be >= 0\n"); + } } break; default: /* pass this option through */ diff --git a/app/utils/string.c b/app/utils/string.c index 6d72c2fd..566709ae 100644 --- a/app/utils/string.c +++ b/app/utils/string.c @@ -202,7 +202,7 @@ size_t zsv_strwhite(unsigned char *s, size_t len, unsigned int flags) { size_t zsv_strencode(unsigned char *s, size_t n, unsigned char replace) { size_t new_len = 0; int clen; - for(size_t i2 = 0; i2 < n; i2 += clen) { + for(size_t i2 = 0; i2 < n; i2 += (size_t)clen) { clen = ZSV_UTF8_CHARLEN(s[i2]); if(LIKELY(clen == 1)) s[new_len++] = s[i2]; diff --git a/include/zsv/common.h b/include/zsv/common.h index 88f47dfe..58d1ca53 100644 --- a/include/zsv/common.h +++ b/include/zsv/common.h @@ -163,6 +163,17 @@ struct zsv_opts { */ const char *insert_header_row; + /* + * number of rows that
the header row spans. If 0 or 1, header is assumed to span 1 row + * otherwise, set to number > 1 to span multiple rows + */ + unsigned int header_span; + + /* + * number of rows to skip before the initial row is processed + */ + unsigned int rows_to_skip; + # ifdef ZSV_EXTRAS struct { size_t rows_interval; // min number of rows between progress callback calls @@ -175,10 +186,11 @@ struct zsv_opts { void *ctx; } completed; - /** + /* * maximum number of rows to parse (including any header rows) */ size_t max_rows; + # endif }; diff --git a/src/zsv.c b/src/zsv.c index 1fb2df80..2cc33cff 100644 --- a/src/zsv.c +++ b/src/zsv.c @@ -74,9 +74,6 @@ enum zsv_status zsv_parse_more(struct zsv_scanner *scanner) { return zsv_status_cancelled; // throw away the next row end - scanner->row_orig = scanner->opts.row; - scanner->row_ctx_orig = scanner->opts.ctx; - scanner->opts.row = zsv_throwaway_row; scanner->opts.ctx = scanner; @@ -138,6 +135,7 @@ size_t zsv_column_count(zsv_parser parser) { ZSV_EXPORT void zsv_set_row_handler(zsv_parser parser, void (*row)(void *ctx)) { parser->opts.row = row; + parser->row_orig = row; } ZSV_EXPORT @@ -201,7 +199,6 @@ ZSV_EXPORT enum zsv_status zsv_set_fixed_offsets(zsv_parser parser, size_t count parser->fixed.offsets[i] = offsets[i]; parser->mode = ZSV_MODE_FIXED; - parser->checked_bom = 1; return zsv_status_ok; @@ -282,11 +279,9 @@ enum zsv_status zsv_delete(zsv_parser parser) { if(parser->free_buff && parser->buff.buff) free(parser->buff.buff); - if(parser->row.cells) - free(parser->row.cells); - + free(parser->row.cells); free(parser->fixed.offsets); - + collate_header_destroy(&parser->collate_header); free(parser); } return zsv_status_ok; diff --git a/src/zsv_internal.c b/src/zsv_internal.c index ea074d77..7e56867c 100644 --- a/src/zsv_internal.c +++ b/src/zsv_internal.c @@ -23,6 +23,15 @@ struct zsv_row { struct zsv_cell *cells; }; +struct collate_header { + struct { + unsigned char *buff; + size_t used; + } buff; + size_t *lengths; // 
length PLUS 1 of each cell + size_t column_count; +}; + struct zsv_scanner { char last; struct { @@ -36,7 +45,9 @@ struct zsv_scanner { unsigned char waiting_for_end; struct zsv_opts opts; void (*row_orig)(void *ctx); - void *row_ctx_orig; + void (*cell_orig)(void *ctx, unsigned char *, size_t); + void *ctx_orig; + size_t row_start; struct zsv_row row; @@ -72,6 +83,8 @@ struct zsv_scanner { unsigned count; // number of offsets } fixed; + struct collate_header *collate_header; + unsigned char checked_bom:1; unsigned char free_buff:1; unsigned char finished:1; @@ -80,6 +93,79 @@ struct zsv_scanner { unsigned char _:3; }; +void collate_header_destroy(struct collate_header **chp) { + if(*chp) { + struct collate_header *ch = *chp; + free(ch->buff.buff); + free(ch->lengths); + free(ch); + *chp = NULL; + } +} + +/* collate_header_append(): return err */ +static int collate_header_append(struct zsv_scanner *scanner, struct collate_header **chp) { + if(!*chp) { + if((*chp = calloc(1, sizeof(struct collate_header)))) + (*chp)->lengths = calloc(scanner->row.allocated, sizeof(*(*chp)->lengths)); + if(!(*chp) || !(*chp)->lengths) { + free(*chp); *chp = NULL; /* clear it so collate_header_destroy() does not free it a second time */ + fprintf(stderr, "Out of memory!\n"); + return -1; + } + } + struct collate_header *ch = *chp; + size_t this_row_size = 0; + size_t column_count = zsv_column_count(scanner); + for(size_t i = 0, j = column_count; i < j; i++) + this_row_size += zsv_get_cell(scanner, i).len + 1; // +1: terminating null or delim + size_t new_row_size = ch->buff.used + this_row_size; + unsigned char *new_row = realloc(ch->buff.buff, new_row_size); + if(!new_row) { + fprintf(stderr, "Out of memory!\n"); + return -1; + } + + // now: splice the new row into the old row, starting with the last cell + // e.g. prior row = A1.B1.C1. + // this row = A2.B2.C2. + // new_row = A1.B1.C1.......... + // starting with last cell in this row, move the old data, then splice new: + // new_row = A1.B1.C1.......C2. + // new_row = A1.B1.C1....C1 C2.
+ // new_row = A1.B1.C1.B2.C1 C2. + // new_row = A1.B1.B1 B2.C1 C2. + // new_row = A1.A2.B1 B2.C1 C2. + // new_row = A1 A2.B1 B2.C1 C2. + + size_t new_row_end = ch->buff.used + this_row_size; + size_t old_row_end = ch->buff.used; + ch->buff.used += this_row_size; + ch->buff.buff = new_row; + for(size_t i = column_count; i > 0; i--) { + struct zsv_cell c = zsv_get_cell(scanner, i-1); + // copy new row's cell value to end + if(c.len) + memcpy(new_row + new_row_end - c.len - 1, c.str, c.len); + new_row[new_row_end - 1] = ' '; + new_row_end = new_row_end - c.len - 1; + + // move prior cell value; source and destination lie in the same buffer and can overlap, hence memmove + size_t old_cell_len = ch->lengths[i-1]; // old_cell_len includes delim + if(old_cell_len) { + memmove(new_row + new_row_end - old_cell_len, + new_row + old_row_end - old_cell_len, + old_cell_len); + old_row_end -= old_cell_len; + new_row_end -= old_cell_len; + } + ch->lengths[i-1] += c.len + 1; + } + if(column_count > ch->column_count) + ch->column_count = column_count; + return 0; +} + __attribute__((always_inline)) static inline void zsv_clear_cell(struct zsv_scanner *scanner) { scanner->quoted = 0; } @@ -158,7 +244,7 @@ __attribute__((always_inline)) static inline enum zsv_status row_dl(struct zsv_s scanner->row.allocated + scanner->row.overflow, scanner->row.allocated); scanner->row.overflow = 0; } - if(LIKELY(scanner->opts.row)) + if(LIKELY(scanner->opts.row != NULL)) scanner->opts.row(scanner->opts.ctx); # ifdef ZSV_EXTRAS scanner->progress.cum_row_count++; @@ -198,7 +284,6 @@ __attribute__((always_inline)) static inline enum zsv_status row_dl(struct zsv_s if(VERY_UNLIKELY(scanner->abort)) return zsv_status_cancelled; scanner->have_cell = 0; -// if(scanner->row.used) scanner->row.used = 0; return zsv_status_ok; } @@ -504,10 +589,81 @@ enum zsv_status zsv_parse_string(struct zsv_scanner *scanner, return zsv_status_ok; } -static void zsv_throwaway_row(void *ctx) { +static void set_callbacks(struct zsv_scanner *scanner); + +static void skip_header_rows(void *ctx) { + struct
zsv_scanner *scanner = ctx; + if(scanner->opts.rows_to_skip) + --scanner->opts.rows_to_skip; + if(!scanner->opts.rows_to_skip) + set_callbacks(scanner); +} + +static void collate_header_row(void *ctx) { struct zsv_scanner *scanner = ctx; - scanner->opts.row = scanner->row_orig; - scanner->opts.ctx = scanner->row_ctx_orig; + if(scanner->opts.header_span) { + --scanner->opts.header_span; + + // save this row + if(collate_header_append(scanner, &scanner->collate_header)) + scanner->abort = 1; + } + if(!scanner->opts.header_span) { + set_callbacks(scanner); + if(scanner->opts.row || scanner->opts.cell) { + if(scanner->collate_header) { + size_t offset = 0; + for(size_t i = 0; i < scanner->collate_header->column_count; i++) { + size_t len_plus1 = scanner->collate_header->lengths[i]; + if(len_plus1) { + scanner->row.cells[i].len = len_plus1 - 1; + scanner->row.cells[i].str = scanner->collate_header->buff.buff + offset; + scanner->row.cells[i].str[len_plus1 - 1] = '\0'; + scanner->row.cells[i].quoted = 1; + } else { + scanner->row.cells[i].len = 0; + scanner->row.cells[i].str = (unsigned char *)""; + } + offset += len_plus1; + } + } + if(scanner->opts.cell) { + // call the user-provided cell() callback on each cell + unsigned char saved_quoted = scanner->quoted; + for(size_t i = 0, j = zsv_column_count(scanner); i < j; i++) { + struct zsv_cell c = zsv_get_cell(scanner, i); + scanner->quoted = c.quoted; + scanner->opts.cell(scanner->opts.ctx, c.str, c.len); + } + scanner->quoted = saved_quoted; + } + if(scanner->opts.row) + // call the user-provided row() callback + scanner->opts.row(scanner->opts.ctx); + + collate_header_destroy(&scanner->collate_header); + } + } +} + +static void set_callbacks(struct zsv_scanner *scanner) { + if(scanner->opts.rows_to_skip) { + scanner->opts.row = skip_header_rows; + scanner->opts.cell = NULL; + scanner->opts.ctx = scanner; + } else if(scanner->opts.header_span > 1) { + scanner->opts.row = collate_header_row; + scanner->opts.cell = NULL; 
+ scanner->opts.ctx = scanner; + } else { + scanner->opts.row = scanner->row_orig; + scanner->opts.cell = scanner->cell_orig; + scanner->opts.ctx = scanner->ctx_orig; + } +} + +static void zsv_throwaway_row(void *ctx) { + set_callbacks(ctx); } static int zsv_scanner_init(struct zsv_scanner *scanner, @@ -557,11 +713,16 @@ static int zsv_scanner_init(struct zsv_scanner *scanner, # endif if(scanner->buff.buff) { scanner->opts = *opts; + scanner->row_orig = scanner->opts.row; + scanner->cell_orig = scanner->opts.cell; + scanner->ctx_orig = scanner->opts.ctx; if(!scanner->opts.max_columns) scanner->opts.max_columns = 1024; + set_callbacks(scanner); if((scanner->row.allocated = scanner->opts.max_columns) && (scanner->row.cells = calloc(scanner->row.allocated, sizeof(*scanner->row.cells)))) return 0; } + return 1; }