From 3329074ea6e45d6a3ef07ad76389b19e89195014 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Mon, 10 Apr 2023 18:58:25 +0200 Subject: [PATCH] DuckDB [todo] --- .gitignore | 3 + cli.js | 94 +- index.js | 289 +----- lib/agency.js | 58 +- lib/calendar.js | 113 +-- lib/calendar_dates.js | 85 +- lib/deps.js | 77 +- lib/feed_info.js | 88 +- lib/frequencies.js | 89 +- lib/get.js | 5 + lib/index.js | 9 +- lib/levels.js | 51 +- lib/pathways.js | 165 ++-- lib/prerequisites.js | 116 +-- lib/routes.js | 110 +-- lib/rows-count.js | 15 + lib/run.js | 5 + lib/service_days.js | 42 +- lib/shapes.js | 128 ++- lib/stats_active_trips_by_hour.js | 6 +- lib/stop_times.js | 565 +++++------ lib/stops.js | 157 ++-- lib/transfers.js | 118 ++- lib/translations.js | 1453 ++++++++++++++--------------- lib/trips.js | 89 +- lib/util.js | 12 - package.json | 11 +- readme.md | 4 + scripts/run-postgraphile.js | 84 -- test/amtrak-gtfs-2021-10-06.sh | 55 +- test/calendar-dates-only.sh | 15 +- test/index.sh | 3 +- test/invalid-empty-agency-id.sh | 8 +- test/multiple-schemas.sh | 19 +- test/postgraphile.sh | 31 - test/routes-without-agency-id.sh | 4 +- test/sample-gtfs-feed.sh | 38 +- test/stops-without-level-id.sh | 5 +- 38 files changed, 1846 insertions(+), 2373 deletions(-) create mode 100644 lib/get.js create mode 100644 lib/rows-count.js create mode 100644 lib/run.js delete mode 100644 lib/util.js delete mode 100755 scripts/run-postgraphile.js delete mode 100755 test/postgraphile.sh diff --git a/.gitignore b/.gitignore index e9d5a74..181f572 100644 --- a/.gitignore +++ b/.gitignore @@ -10,8 +10,11 @@ pnpm-debug.log /shrinkwrap.yaml /test/amtrak-gtfs-2021-10-06 +/test/*.duckdb /*.gtfs /*.gtfs.zip /*.gtfs.tar.gz /*.gtfs.tar.zst + +/*.duckdb diff --git a/cli.js b/cli.js index ed0b0be..df8d27e 100755 --- a/cli.js +++ b/cli.js @@ -44,9 +44,6 @@ const { 'lower-case-lang-codes': { type: 'boolean', }, - 'stops-location-index': { - type: 'boolean', - }, 'stats-by-route-date': { type: 'string', }, @@ -59,21 +56,6 @@ const { 'schema': { type: 'string', }, - 'postgraphile': { - type: 'boolean', - }, - 'postgraphile-password': { - type: 'string', - }, - 'postgrest': { - type: 'boolean', - }, - 'postgrest-password': { - type: 'string', - }, - 'postgrest-query-cost-limit': { - type: 'string', - }, 'import-metadata': { type: 'boolean', } @@ -84,7 +66,7 @@ const { if (flags.help) { process.stdout.write(` Usage: - gtfs-to-sql [options] [--] ... + import-gtfs-into-duckdb [options] [--] ... Options: --silent -s Don't show files being converted. --require-dependencies -d Require files that the specified GTFS files depend @@ -101,8 +83,6 @@ Options: --routes-without-agency-id Don't require routes.txt items to have an agency_id. --stops-without-level-id Don't require stops.txt items to have a level_id. Default if levels.txt has not been provided. - --stops-location-index Create a spatial index on stops.stop_loc for efficient - queries by geolocation. --lower-case-lang-codes Accept Language Codes (e.g. in feed_info.feed_lang) with a different casing than the official BCP-47 language tags (as specified by the GTFS spec), @@ -123,34 +103,18 @@ Options: currently running trips over time, by hour. Like --stats-by-route-date, this flag accepts none, view & materialized-view. - --schema The schema to use for the database. 
Default: public - Even when importing into a schema other than \`public\`, - a function \`public.gtfs_via_postgres_import_version()\` - gets created, to ensure that multiple imports into the - same database are all made using the same version. See - also multiple-datasets.md in the docs. - --postgraphile Tweak generated SQL for PostGraphile usage. - https://www.graphile.org/postgraphile/ - --postgraphile-password Password for the PostGraphile PostgreSQL user. - Default: $POSTGRAPHILE_PGPASSWORD, fallback random. - --postgrest Tweak generated SQL for PostgREST usage. - Please combine it with --schema. - https://postgrest.org/ - --postgrest-password Password for the PostgREST PostgreSQL user \`web_anon\`. - Default: $POSTGREST_PGPASSWORD, fallback random. - --postgrest-query-cost-limit Define a cost limit [1] for queries executed by PostgREST - on behalf of a user. It is only enforced if - pg_plan_filter [2] is installed in the database! - Must be a positive float. Default: none - [1] https://www.postgresql.org/docs/14/using-explain.html - [2] https://github.com/pgexperts/pg_plan_filter + --schema The schema to use for the database. Default: main + May not contain \`.\`. --import-metadata Create functions returning import metadata: - gtfs_data_imported_at (timestamp with time zone) - gtfs_via_postgres_version (text) - gtfs_via_postgres_options (jsonb) +Notes: + If you just want to check if the GTFS data can be imported but don't care about the + resulting DuckDB database file, you can import into an in-memory database by specifying + \`:memory:\` as the . Examples: - gtfs-to-sql some-gtfs/*.txt | sponge | psql -b # import into PostgreSQL - gtfs-to-sql -u -- some-gtfs/*.txt | gzip >gtfs.sql.gz # generate a gzipped SQL dump + import-gtfs-into-duckdb some-gtfs.duckdb some-gtfs/*.txt [1] https://developers.google.com/transit/gtfs/reference/extended-route-types [2] https://groups.google.com/g/gtfs-changes/c/keT5rTPS7Y0/m/71uMz2l6ke0J @@ -164,11 +128,11 @@ if (flags.version) { } const {basename, extname} = require('path') -const {pipeline} = require('stream') const convertGtfsToSql = require('./index') -const DataError = require('./lib/data-error') -const files = args.map((file) => { +const [pathToDb] = args + +const files = args.slice(1).map((file) => { const name = basename(file, extname(file)) return {name, file} }) @@ -184,9 +148,7 @@ const opt = { statsByRouteIdAndDate: flags['stats-by-route-date'] || 'none', statsByAgencyIdAndRouteIdAndStopAndHour: flags['stats-by-agency-route-stop-hour'] || 'none', statsActiveTripsByHour: flags['stats-active-trips-by-hour'] || 'none', - schema: flags['schema'] || 'public', - postgraphile: !!flags.postgraphile, - postgrest: !!flags.postgrest, + schema: flags['schema'] || 'main', importMetadata: !!flags['import-metadata'], } if ('stops-without-level-id' in flags) { @@ -195,31 +157,11 @@ if ('stops-without-level-id' in flags) { if ('lower-case-lang-codes' in flags) { opt.lowerCaseLanguageCodes = flags['lower-case-lang-codes'] } -if ('postgraphile-password' in flags) { - opt.postgraphilePassword = flags['postgraphile-password'] -} -if ('postgrest-password' in flags) { - opt.postgrestPassword = flags['postgrest-password'] -} -if ('postgrest-query-cost-limit' in flags) { - const limit = parseFloat(flags['postgrest-query-cost-limit']) - if (!Number.isFinite(limit) || limit < 0) { - console.error('Invalid --postgrest-query-cost-limit value.') - process.exit(1) - } - opt.lowerCaseLanguageCodes = limit -} -pipeline( - convertGtfsToSql(files, opt), - process.stdout, - 
(err) => { - if (!err) return; - if (err instanceof DataError) { - console.error(String(err)) - } else if (err.code !== 'EPIPE') { - console.error(err) - } - process.exit(1) +convertGtfsToSql(pathToDb, files, opt) +.catch((err) => { + if (err.code !== 'EPIPE') { // todo: check still necessary? we don't pipe anymore + console.error(err) } -) + process.exit(1) +}) diff --git a/index.js b/index.js index 34e8c12..c4c7771 100644 --- a/index.js +++ b/index.js @@ -1,16 +1,22 @@ 'use strict' -const debug = require('debug')('gtfs-via-postgres') -const {randomBytes} = require('crypto') +const createDebug = require('debug') const sequencify = require('sequencify') -const {inspect} = require('util') -const readCsv = require('gtfs-utils/read-csv') -const {Stringifier} = require('csv-stringify') +const {Database} = require('duckdb') +const {promisify} = require('util') const formatters = require('./lib') const getDependencies = require('./lib/deps') +const RUN = require('./lib/run.js') +const GET = require('./lib/get.js') const pkg = require('./package.json') -const convertGtfsToSql = async function* (files, opt = {}) { +// todo: rename +const debug = createDebug('gtfs-via-postgres') +const debugSql = createDebug('gtfs-via-postgres:sql') + +const convertGtfsToSql = async (pathToDb, files, opt = {}) => { + debug('pathToDb', pathToDb) + opt = { silent: false, requireDependencies: false, @@ -19,19 +25,11 @@ const convertGtfsToSql = async function* (files, opt = {}) { tripsWithoutShapeId: !files.some(f => f.name === 'shapes'), routesWithoutAgencyId: false, stopsWithoutLevelId: !files.some(f => f.name === 'levels'), - stopsLocationIndex: false, lowerCaseLanguageCodes: false, statsByRouteIdAndDate: 'none', statsByAgencyIdAndRouteIdAndStopAndHour: 'none', statsActiveTripsByHour: 'none', - schema: 'public', - postgraphile: false, - postgraphilePassword: process.env.POSTGRAPHILE_PGPASSWORD || null, - postgrest: false, - postgrestPassword: process.env.POSTGREST_PASSWORD || null, - // see https://github.com/pgexperts/pg_plan_filter - // see also https://www.postgresql.org/docs/14/using-explain.html - postgrestQueryCostLimit: null, // or float + schema: 'main', importMetadata: false, ...opt, } @@ -46,16 +44,6 @@ const convertGtfsToSql = async function* (files, opt = {}) { statsByAgencyIdAndRouteIdAndStopAndHour, statsActiveTripsByHour, } = opt - let postgraphilePassword = opt.postgraphilePassword - if (opt.postgraphile && postgraphilePassword === null) { - postgraphilePassword = randomBytes(10).toString('hex') - console.error(`PostGraphile PostgreSQL user's password:`, postgraphilePassword) - } - let postgrestPassword = opt.postgrestPassword - if (opt.postgrest && postgrestPassword === null) { - postgrestPassword = randomBytes(10).toString('hex') - console.error(`PostrREST PostgreSQL user's password:`, postgrestPassword) - } if (ignoreUnsupportedFiles) { files = files.filter(f => !!formatters[f.name]) @@ -67,17 +55,12 @@ const convertGtfsToSql = async function* (files, opt = {}) { debug('deps', deps) const tasks = { // file name -> [dep name] - 'is_valid_lang_code': { + 'valid_lang_codes': { dep: [], }, - 'is_timezone': { + 'valid_timezones': { dep: [], }, - ...(tripsWithoutShapeId ? 
{} : { - 'shape_exists': { - dep: [...deps.shape_exists], - }, - }), // special handling of calendar/calendar_dates: // service_days relies on *both* calendar's & calendar_dates' tables to @@ -99,28 +82,6 @@ const convertGtfsToSql = async function* (files, opt = {}) { 'frequencies': { dep: [...deps.frequencies], }, - - ...(importMetadata ? { - 'import_metadata': { - dep: [], - }, - } : {}), - - ...(statsByRouteIdAndDate !== 'none' ? { - 'stats_by_route_date': { - dep: ['stop_times'], - }, - } : {}), - ...(statsByAgencyIdAndRouteIdAndStopAndHour !== 'none' ? { - 'stats_by_agency_route_stop_hour': { - dep: ['stop_times'], - }, - } : {}), - ...(statsActiveTripsByHour !== 'none' ? { - 'stats_active_trips_by_hour': { - dep: ['stop_times'], - }, - } : {}), } for (const file of files) { @@ -150,62 +111,31 @@ const convertGtfsToSql = async function* (files, opt = {}) { opt.importStart = Date.now() - yield `\ --- GTFS SQL dump generated by ${pkg.name} v${pkg.version} --- ${pkg.homepage} --- options: -${inspect(opt, {compact: false}).split('\n').map(line => '-- ' + line).join('\n')} - -\\set ON_ERROR_STOP on -CREATE EXTENSION IF NOT EXISTS postgis; -${opt.schema !== 'public' ? `CREATE SCHEMA IF NOT EXISTS "${opt.schema}";` : ''} -BEGIN; - --- gtfs-via-postgres supports importing >1 GTFS datasets into 1 DB, each dataset within its own schema. See https://github.com/public-transport/gtfs-via-postgres/issues/51 for more information. --- Because almost all helper utilities (enums, functions, etc.) are schema-specific, they get imported more than once. In order to prevent subtle bugs due to incompatibilities among two schemas imported by different gtfs-via-postgres versions, we mock a "mutex" here by checking for public.gtfs_via_postgres_import_version()'s return value. 
- --- todo: this can be done more elegantly: just a "DO" block, "ASSERT" that the version matches, create gtfs_via_postgres_import_version() in the "EXCEPTION" block -CREATE FUNCTION pg_temp.get_gtfs_via_postgres_import_version() -RETURNS TEXT -AS $$ - DECLARE - res TEXT; - BEGIN - SELECT public.gtfs_via_postgres_import_version() INTO res; - RETURN res; - EXCEPTION - WHEN undefined_function THEN - -- do nothing, silence error - RETURN NULL; - END; -$$ -LANGUAGE plpgsql; - -DO $$ -BEGIN - IF EXISTS ( - SELECT version - FROM ( - SELECT pg_temp.get_gtfs_via_postgres_import_version() AS version - ) t - WHERE version != '${pkg.version}' - ) THEN - RAISE EXCEPTION 'existing GTFS data imported with an incompatible version of gtfs-via-postgres'; - END IF; -END -$$ -LANGUAGE plpgsql; - -CREATE OR REPLACE FUNCTION public.gtfs_via_postgres_import_version() -RETURNS TEXT -AS $$ - SELECT '${pkg.version}' -$$ -LANGUAGE sql; + const db = new Database(pathToDb) + const createQueryDb = (method, logPrefix) => { + const queryDb = (query, ...additionalArgs) => { + debugSql(logPrefix, query, ...additionalArgs) + return new Promise((resolve, reject) => { + db[method](query, ...additionalArgs, (err, result) => { + if (err) { + err.query = query + reject(err) + } else { + resolve(result) + } + }) + }) + } + return queryDb + } + db[RUN] = createQueryDb('run', 'DB[RUN]') + db[GET] = createQueryDb('all', 'DB[GET]') -\n` + await db[RUN](` +-- BEGIN TRANSACTION; +CREATE SCHEMA IF NOT EXISTS "${opt.schema}"; +`) - const csv = new Stringifier({quoted: true}) const nrOfRowsByName = new Map() const workingState = { nrOfRowsByName, @@ -214,145 +144,22 @@ LANGUAGE sql; for (const name of order) { if (!silent) console.error(name) const task = tasks[name] - yield `-- ${name}\n-----------------\n\n` - const { - beforeAll, - afterAll, - } = formatters[name] + const importData = formatters[name] - if ('string' === typeof beforeAll && beforeAll) { - yield beforeAll - } else if ('function' === typeof beforeAll) { - yield beforeAll(opt) - } - - if (task.file) { - const {formatRow} = formatters[name] - let nrOfRows = 0 - for await (const rawRow of await readCsv(task.file)) { - const row = formatRow(rawRow, opt, workingState) - let formattedRow = null - csv.api.__transform(row, (_formattedRow) => { - formattedRow = _formattedRow - }) - yield formattedRow - nrOfRows++ - } - - nrOfRowsByName.set(name, nrOfRows) - // todo [breaking]: indent with \t - // todo [breaking]: print a summary of all files instead - if (!silent) console.error(` processed ${nrOfRows} rows`) - } - - if ('string' === typeof afterAll && afterAll) { - yield afterAll + ';\n' - } else if ('function' === typeof afterAll) { - yield afterAll(opt) + ';\n' + try { + await importData(db, task.file || null, opt, workingState) + } catch (err) { + err.gtfsFile = name + throw err } } - yield `\ - -${opt.postgraphile ? `\ --- seal imported data --- todo: --- > Be careful with public schema.It already has a lot of default privileges that you maybe don't want... See documentation[1]. 
--- > [1]: postgresql.org/docs/11/ddl-schemas.html#DDL-SCHEMAS-PRIV -DO $$ -BEGIN - -- https://stackoverflow.com/questions/8092086/create-postgresql-role-user-if-it-doesnt-exist#8099557 - IF EXISTS ( - SELECT FROM pg_catalog.pg_roles - WHERE rolname = 'postgraphile' - ) THEN - RAISE NOTICE 'Role "postgraphile" already exists, skipping creation.'; - ELSE - CREATE ROLE postgraphile LOGIN PASSWORD '${opt.postgraphilePassword}'; -- todo: escape properly - END IF; -END -$$; -DO $$ - DECLARE - db TEXT := current_database(); - BEGIN - -- todo: grant just on $opt.schema instead? - EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %I TO %I', db, 'postgraphile'); - END -$$; -GRANT USAGE ON SCHEMA "${opt.schema}" TO postgraphile; --- https://stackoverflow.com/questions/760210/how-do-you-create-a-read-only-user-in-postgresql#comment50679407_762649 -REVOKE CREATE ON SCHEMA "${opt.schema}" FROM PUBLIC; -GRANT SELECT ON ALL TABLES IN SCHEMA "${opt.schema}" TO postgraphile; --- ALTER DEFAULT PRIVILEGES IN SCHEMA "${opt.schema}" GRANT SELECT ON TABLES TO postgraphile; --- todo: set search_path? https://stackoverflow.com/questions/760210/how-do-you-create-a-read-only-user-in-postgresql#comment33535263_762649 -` : ''} - -${opt.postgrest ? `\ -${opt.schema !== 'public' ? `\ --- pattern from https://stackoverflow.com/a/8099557 -DO -$$ -BEGIN - -- Roles are shared across databases, so we have remove previously configured privileges. - -- This might of course interfere with other programs running on the DBMS! - -- todo: find a cleaner solution - IF EXISTS ( - SELECT FROM pg_catalog.pg_roles - WHERE rolname = 'web_anon' - ) THEN - RAISE WARNING 'Role web_anon already exists. Reassigning owned DB objects to current_user().'; - REASSIGN OWNED BY web_anon TO SESSION_USER; - ELSE - BEGIN - CREATE ROLE web_anon NOLOGIN NOINHERIT; - EXCEPTION - WHEN duplicate_object THEN - RAISE NOTICE 'Role web_anon was just created by a concurrent transaction.'; - END; - END IF; - IF EXISTS ( - SELECT FROM pg_catalog.pg_roles - WHERE rolname = 'postgrest' - ) THEN - RAISE WARNING 'Role postgrest already exists. Reassigning owned DB objects to current_user().'; - REASSIGN OWNED BY postgrest TO SESSION_USER; - ELSE - BEGIN - CREATE ROLE postgrest LOGIN NOINHERIT NOCREATEDB NOCREATEROLE NOSUPERUSER PASSWORD '${postgrestPassword}'; - EXCEPTION - WHEN duplicate_object THEN - RAISE NOTICE 'Role postgrest was just created by a concurrent transaction.'; - END; - END IF; -END -$$; - - --- https://postgrest.org/en/stable/tutorials/tut0.html#step-4-create-database-for-api --- https://postgrest.org/en/stable/explanations/db_authz.html --- todo: is this secure? -GRANT USAGE ON SCHEMA "${opt.schema}" TO web_anon; -GRANT SELECT ON ALL TABLES IN SCHEMA "${opt.schema}" TO web_anon; -GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA "${opt.schema}" TO web_anon; -GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA "${opt.schema}" TO web_anon; - -GRANT web_anon TO postgrest; - -${opt.postgrestQueryCostLimit !== null ? ` --- If pg_plan_filter is installed, limit the cost of queries made by PostgREST users. -ALTER USER web_anon SET plan_filter.statement_cost_limit = ${opt.postgrestQueryCostLimit}; -` : ''} - -COMMENT ON SCHEMA "${opt.schema}" IS -$$GTFS REST API -This REST API is created by running [PostgREST](https://postgrest.org/) on top of a [PostgreSQL](https://www.postgresql.org) DB generated using [${pkg.name} v${pkg.version}](${pkg.homepage || pkg.repository}). 
-$$; -` : ''} -` : ''} + debug('workingState', workingState) -COMMIT;` + // todo + // await db[RUN]('COMMIT') + debug('done!') } module.exports = convertGtfsToSql diff --git a/lib/agency.js b/lib/agency.js index f9e3697..8ae1b07 100644 --- a/lib/agency.js +++ b/lib/agency.js @@ -1,50 +1,40 @@ 'use strict' +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/schedule/reference/#agencytxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToAgency, opt, workingState) => { + await db[RUN](`\ CREATE TABLE "${opt.schema}".agency ( agency_id TEXT PRIMARY KEY, agency_name TEXT NOT NULL, agency_url TEXT NOT NULL, - agency_timezone TEXT NOT NULL - CONSTRAINT valid_timezone CHECK ("${opt.schema}".is_timezone(agency_timezone)), + agency_timezone TEXT NOT NULL REFERENCES "${opt.schema}".valid_timezones (tz), agency_lang TEXT, -- todo: validate? agency_phone TEXT, agency_fare_url TEXT, agency_email TEXT ); -COPY "${opt.schema}".agency ( - agency_id, - agency_name, - agency_url, - agency_timezone, - agency_lang, - agency_phone, - agency_fare_url, - agency_email -) FROM STDIN csv; -` +INSERT INTO "${opt.schema}".agency +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * +FROM read_csv( + '${pathToAgency}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'] + -- todo: all_varchar = true, types +); +`) -const formatAgencyRow = (a) => { - return [ - a.agency_id || null, - a.agency_name || null, - a.agency_url || null, - a.agency_timezone || null, - a.agency_lang || null, - a.agency_phone || null, - a.agency_fare_url || null, - a.agency_email || null, - ] + workingState.nrOfRowsByName.set('agency', await queryNumberOfRows(db, 'agency', opt)) } -const afterAll = `\ -\\. -` - -module.exports = { - beforeAll, - formatRow: formatAgencyRow, - afterAll, -} +module.exports = importData diff --git a/lib/calendar.js b/lib/calendar.js index 7b59790..223a11f 100644 --- a/lib/calendar.js +++ b/lib/calendar.js @@ -1,67 +1,76 @@ 'use strict' +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/schedule/reference/#calendartxt -const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".availability AS ENUM ( +const importData = async (db, pathToCalendar, opt, workingState) => { + await db[RUN](`\ +CREATE TYPE "${opt.schema}.availability" AS ENUM ( 'not_available' -- 0 – Service is not available for Mondays in the date range. , 'available' -- 1 – Service is available for all Mondays in the date range. 
); -CREATE CAST ("${opt.schema}".availability AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST ("${opt.schema}.availability" AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".calendar ( +CREATE TABLE "${opt.schema}.calendar" ( service_id TEXT PRIMARY KEY, - monday "${opt.schema}".availability NOT NULL, - tuesday "${opt.schema}".availability NOT NULL, - wednesday "${opt.schema}".availability NOT NULL, - thursday "${opt.schema}".availability NOT NULL, - friday "${opt.schema}".availability NOT NULL, - saturday "${opt.schema}".availability NOT NULL, - sunday "${opt.schema}".availability NOT NULL, + monday "${opt.schema}.availability" NOT NULL, + tuesday "${opt.schema}.availability" NOT NULL, + wednesday "${opt.schema}.availability" NOT NULL, + thursday "${opt.schema}.availability" NOT NULL, + friday "${opt.schema}.availability" NOT NULL, + saturday "${opt.schema}.availability" NOT NULL, + sunday "${opt.schema}.availability" NOT NULL, start_date DATE NOT NULL, end_date DATE NOT NULL ); +`) -COPY "${opt.schema}".calendar ( - service_id, - monday, - tuesday, - wednesday, - thursday, - friday, - saturday, - sunday, - start_date, - end_date -) FROM STDIN csv; -` - -const availability = (val) => { - if (val === '0') return 'not_available' - if (val === '1') return 'available' - throw new Error('invalid availability: ' + val) -} + if (pathToCalendar !== null) { + await db[RUN](`\ +INSERT INTO "${opt.schema}.calendar" +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * REPLACE ( + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::"${opt.schema}.availability")[monday + 1] AS monday, + enum_range(NULL::"${opt.schema}.availability")[tuesday + 1] AS tuesday, + enum_range(NULL::"${opt.schema}.availability")[wednesday + 1] AS wednesday, + enum_range(NULL::"${opt.schema}.availability")[thursday + 1] AS thursday, + enum_range(NULL::"${opt.schema}.availability")[friday + 1] AS friday, + enum_range(NULL::"${opt.schema}.availability")[saturday + 1] AS saturday, + enum_range(NULL::"${opt.schema}.availability")[sunday + 1] AS sunday, + array_slice(start_date, 0, 4) || '-' || array_slice(start_date, 5, 6) || '-' || array_slice(start_date, 7, 8) AS start_date, + array_slice(end_date, 0, 4) || '-' || array_slice(end_date, 5, 6) || '-' || array_slice(end_date, 7, 8) AS end_date +) +FROM read_csv( + '${pathToCalendar}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + -- todo: all_varchar = true, types + types = { + service_id: 'TEXT', + monday: 'UINTEGER', + tuesday: 'UINTEGER', + wednesday: 'UINTEGER', + thursday: 'UINTEGER', + friday: 'UINTEGER', + saturday: 'UINTEGER', + sunday: 'UINTEGER', + start_date: 'TEXT', + end_date: 'TEXT' + } +); +`) + } -const formatCalendarRow = (c) => { - return [ - c.service_id || null, - c.monday ? availability(c.monday) : null, - c.tuesday ? availability(c.tuesday) : null, - c.wednesday ? 
availability(c.wednesday) : null, - c.thursday ? availability(c.thursday) : null, - c.friday ? availability(c.friday) : null, - c.saturday ? availability(c.saturday) : null, - c.sunday ? availability(c.sunday) : null, - c.start_date, - c.end_date, - ] + workingState.nrOfRowsByName.set('calendar', await queryNumberOfRows(db, 'calendar', opt)) } -const afterAll = `\ -\\. -` - -module.exports = { - beforeAll, - formatRow: formatCalendarRow, - afterAll, -} +module.exports = importData diff --git a/lib/calendar_dates.js b/lib/calendar_dates.js index 8c74e6a..abaef80 100644 --- a/lib/calendar_dates.js +++ b/lib/calendar_dates.js @@ -1,54 +1,61 @@ 'use strict' +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/schedule/reference/#calendar_datestxt -const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".exception_type_v AS ENUM ( +const importData = async (db, pathToCalendarDates, opt, workingState) => { + await db[RUN](`\ +CREATE TYPE "${opt.schema}.exception_type_v" AS ENUM ( 'added' -- 1 – Service has been added for the specified date. , 'removed' -- 2 – Service has been removed for the specified date. ); -CREATE CAST ("${opt.schema}".exception_type_v AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST ("${opt.schema}.exception_type_v" AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".calendar_dates ( +CREATE TABLE "${opt.schema}.calendar_dates" ( service_id TEXT NOT NULL, "date" DATE NOT NULL, PRIMARY KEY (service_id, "date"), - exception_type "${opt.schema}".exception_type_v NOT NULL + --CONSTRAINT primary_key PRIMARY KEY (service_id, "date"), + exception_type "${opt.schema}.exception_type_v" NOT NULL ); +`) + if (pathToCalendarDates !== null) { + await db[RUN](`\ +INSERT INTO "${opt.schema}.calendar_dates" +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * REPLACE ( + array_slice(date, 0, 4) || '-' || array_slice(date, 5, 6) || '-' || array_slice(date, 7, 8) AS date, + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::"${opt.schema}.exception_type_v")[exception_type] AS exception_type, +) +FROM read_csv( + '${pathToCalendarDates}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. 
+ auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + -- todo: all_varchar = true, types + types = { + service_id: 'TEXT', + date: 'TEXT', + exception_type: 'UINTEGER' + } +); +`) + } -COPY "${opt.schema}".calendar_dates ( - service_id, - date, - exception_type -) FROM STDIN csv; -` - -const exceptionType = (val) => { - if (val === '1') return 'added' - if (val === '2') return 'removed' - throw new Error('invalid exception_type: ' + val) -} + await db[RUN](`\ +CREATE INDEX calendar_dates_service_id ON "${opt.schema}.calendar_dates" (service_id); +CREATE INDEX calendar_dates_exception_type ON "${opt.schema}.calendar_dates" (exception_type); +`) -const formatCalendarDatesRow = (e) => { - return [ - e.service_id || null, - e.date, - e.exception_type ? exceptionType(e.exception_type) : null, - ] + workingState.nrOfRowsByName.set('calendar_dates', await queryNumberOfRows(db, 'calendar_dates', opt)) } -const afterAll = (opt) => `\ -\\. - -CREATE INDEX ON "${opt.schema}".calendar_dates (service_id); -CREATE INDEX ON "${opt.schema}".calendar_dates (exception_type); - -${opt.postgraphile ? `\ -COMMENT ON TABLE "${opt.schema}".calendar_dates IS E'@foreignKey (service_id) references calendar|@fieldName calendar'; -` : ''} -` - -module.exports = { - beforeAll, - formatRow: formatCalendarDatesRow, - afterAll, -} +module.exports = importData diff --git a/lib/deps.js b/lib/deps.js index 5d9e2c3..fbd74eb 100644 --- a/lib/deps.js +++ b/lib/deps.js @@ -7,14 +7,11 @@ const getDependencies = (opt, files) => { stopsWithoutLevelId, } = opt return { - shape_exists: [ - 'shapes', - ], agency: [ - 'is_timezone', + 'valid_timezones', ], stops: [ - 'is_timezone', + 'valid_timezones', ...(stopsWithoutLevelId ? [] : ['levels']), ], transfers: [ @@ -32,7 +29,7 @@ const getDependencies = (opt, files) => { trips: [ 'routes', 'service_days', - ...(tripsWithoutShapeId ? [] : ['shapes', 'shape_exists']), + ...(tripsWithoutShapeId ? [] : ['shapes']), ], frequencies: [ 'trips', @@ -41,46 +38,46 @@ const getDependencies = (opt, files) => { 'stops', ], feed_info: [ - 'is_valid_lang_code', + 'valid_lang_codes', ], translations: [ - 'is_valid_lang_code', - // > table_name - // > Defines the dataset table that contains the field to be translated. The following values are allowed: - // > agency - // > stops - // > routes - // > trips - // > stop_times - // > feed_info - // > pathways - // > levels - // > attributions - // https://gtfs.org/schedule/reference/#translationstxt - // todo: respect opt.*! - // these are soft dependencies, they are not depended upon, they must only be imported first - // todo: only specify dependencies here if the files are not in use - 'agency', - 'stops', - 'routes', - 'trips', - ...(files.includes('stop_times') - ? ['stop_times'] - : [] - ), - ...(files.includes('feed_info') - ? ['feed_info'] - : [] - ), - ...(files.includes('pathways') - ? ['pathways'] - : [] - ), + 'valid_lang_codes', + // // > table_name + // // > Defines the dataset table that contains the field to be translated. The following values are allowed: + // // > agency + // // > stops + // // > routes + // // > trips + // // > stop_times + // // > feed_info + // // > pathways + // // > levels + // // > attributions + // // https://gtfs.org/schedule/reference/#translationstxt + // // todo: respect opt.*! 
+ // // these are soft dependencies, they are not depended upon, they must only be imported first + // // todo: only specify dependencies here if the files are not in use + // 'agency', + // 'stops', + // 'routes', + // 'trips', + // ...(files.includes('stop_times') + // ? ['stop_times'] + // : [] + // ), + // ...(files.includes('feed_info') + // ? ['feed_info'] + // : [] + // ), + // ...(files.includes('pathways') + // ? ['pathways'] + // : [] + // ), ...(files.includes('levels') ? ['levels'] : [] ), - // not supported yet: attributions + // // not supported yet: attributions ], } } diff --git a/lib/feed_info.js b/lib/feed_info.js index 9057ac6..047ace8 100644 --- a/lib/feed_info.js +++ b/lib/feed_info.js @@ -1,7 +1,10 @@ 'use strict' +const RUN = require('./run.js') + // https://gtfs.org/schedule/reference/#feed_infotxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToFeedInfo, opt, workingState) => { + await db[RUN](`\ -- The MobilityData GTFS Validator just uses Java's Locale#toLanguageTag() to validate *_lang. -- https://github.com/MobilityData/gtfs-validator/blob/31ff374800f7d7883fd9de91b71049c2a4de4e45/main/src/main/java/org/mobilitydata/gtfsvalidator/validator/MatchingFeedAndAgencyLangValidator.java#L82 -- https://docs.oracle.com/javase/7/docs/api/java/util/Locale.html @@ -9,14 +12,23 @@ const beforeAll = (opt) => `\ CREATE TABLE "${opt.schema}".feed_info ( feed_publisher_name TEXT PRIMARY KEY, feed_publisher_url TEXT NOT NULL, - feed_lang TEXT NOT NULL - CONSTRAINT valid_feed_lang CHECK ( - "${opt.schema}".is_valid_lang_code(feed_lang) - ), - default_lang TEXT - CONSTRAINT valid_default_lang CHECK ( - default_lang IS NULL OR "${opt.schema}".is_valid_lang_code(default_lang) - ), + feed_lang TEXT NOT NULL, + -- todo: as of DuckDB v1.0.0, "subqueries prohibited in CHECK constraints" + -- CONSTRAINT valid_feed_lang CHECK ( + -- EXISTS (SELECT * FROM valid_lang_codes WHERE lang_code = feed_lang) + -- ), + -- todo: wait for https://github.com/duckdb/duckdb/issues/604 + -- > There are still a few places where collations […] are currently not considered: + -- > - […] + -- > - Support for collations in indexes + -- FOREIGN KEY (feed_lang) REFERENCES "${opt.schema}".valid_lang_codes, + default_lang TEXT, + -- todo: as of DuckDB v1.0.0, "subqueries prohibited in CHECK constraints" + -- CONSTRAINT valid_default_lang CHECK ( + -- EXISTS (SELECT * FROM valid_lang_codes WHERE lang_code = default_lang) + -- ), + -- todo: see above + -- FOREIGN KEY (default_lang) REFERENCES "${opt.schema}".valid_lang_codes, feed_start_date DATE, feed_end_date DATE, feed_version TEXT, @@ -24,39 +36,29 @@ CREATE TABLE "${opt.schema}".feed_info ( feed_contact_url TEXT ); -COPY "${opt.schema}".feed_info ( - feed_publisher_name, - feed_publisher_url, - feed_lang, - default_lang, - feed_start_date, - feed_end_date, - feed_version, - feed_contact_email, - feed_contact_url -) FROM STDIN csv; -` - -const formatFeedInfoRow = (i) => { - return [ - i.feed_publisher_name || null, - i.feed_publisher_url || null, - i.feed_lang || null, - i.default_lang || null, - i.feed_start_date || null, - i.feed_end_date || null, - i.feed_version || null, - i.feed_contact_email || null, - i.feed_contact_url || null, - ] +INSERT INTO "${opt.schema}".feed_info +-- Matching by name allows the CSV file to have a different set and order of columns. 
+-- todo: handle the CSV file having *additional* columns
+BY NAME
+SELECT * REPLACE (
+	(
+		array_slice(feed_start_date, 0, 4)
+		|| '-' || array_slice(feed_start_date, 5, 6)
+		|| '-' || array_slice(feed_start_date, 7, 8)
+	) AS feed_start_date,
+	(
+		array_slice(feed_end_date, 0, 4)
+		|| '-' || array_slice(feed_end_date, 5, 6)
+		|| '-' || array_slice(feed_end_date, 7, 8)
+	) AS feed_end_date
+)
+FROM read_csv(
+	'${pathToFeedInfo}',
+	header = true,
+	-- > Option to skip type detection for CSV parsing and assume all columns to be of type VARCHAR [a.k.a. TEXT].
+	all_varchar = true
+);
+`)
 }
 
-const afterAll = `\
-\\.
-`
-
-module.exports = {
-	beforeAll,
-	formatRow: formatFeedInfoRow,
-	afterAll,
-}
+module.exports = importData
diff --git a/lib/frequencies.js b/lib/frequencies.js
index 1a894b7..e2c1002 100644
--- a/lib/frequencies.js
+++ b/lib/frequencies.js
@@ -1,9 +1,11 @@
 'use strict'
 
-const {formatTime} = require('./util')
+const RUN = require('./run.js')
+const {queryNumberOfRows} = require('./rows-count.js')
 
 // https://gtfs.org/schedule/reference/#frequenciestxt
-const beforeAll = (opt) => `\
+const importData = async (db, pathToFrequencies, opt, workingState) => {
+	await db[RUN](`\
 CREATE TYPE "${opt.schema}".exact_times_v AS ENUM (
 	'frequency_based' -- 0 or empty - Frequency-based trips.
 	, 'schedule_based' -- 1 – Schedule-based trips with the exact same headway throughout the day. In this case the end_time value must be greater than the last desired trip start_time but less than the last desired trip start_time + headway_secs.
@@ -14,59 +16,52 @@ CREATE TABLE "${opt.schema}".frequencies (
 	trip_id TEXT NOT NULL,
 	FOREIGN KEY (trip_id) REFERENCES "${opt.schema}".trips,
 	start_time INTERVAL NOT NULL,
+	-- todo, once supported by DuckDB: PRIMARY KEY (trip_id, start_time)
 	end_time INTERVAL NOT NULL,
 	headway_secs INT NOT NULL,
-	exact_times "${opt.schema}".exact_times_v,
-	UNIQUE (
-		trip_id,
-		start_time,
-		end_time,
-		headway_secs,
-		exact_times
-	)
+	exact_times "${opt.schema}".exact_times_v -- todo: NOT NULL & ifnull()
 );
 
-COPY "${opt.schema}".frequencies (
+INSERT INTO "${opt.schema}".frequencies
+-- Matching by name allows the CSV file to have a different set and order of columns.
+-- todo: handle the CSV file having *additional* columns
+BY NAME
+SELECT * REPLACE (
+	-- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the enum index by hand using enum_range().
+	-- DuckDB array/list indexes are 1-based.
+	enum_range(NULL::exact_times_v)[exact_times + 1] AS exact_times
+)
+FROM read_csv(
+	'${pathToFrequencies}',
+	header = true,
+	-- > This option allows you to specify the types that the sniffer will use when detecting CSV column types.
+	-- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR
+	-- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt).
+	-- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently.
+	auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'],
+	-- todo: all_varchar = true, types
+	types = {
+		start_time: 'INTERVAL',
+		end_time: 'INTERVAL',
+		exact_times: 'INTEGER',
+	}
+);
+
+-- We create the UNIQUE index *afterwards* to make the data import faster.
+CREATE UNIQUE INDEX frequencies_unique ON "${opt.schema}".frequencies (
 	trip_id,
-	start_time,
-	end_time,
+	-- As of v1.0.0, DuckDB does not support UNIQUE indexes on INTERVAL columns yet, so we cast to VARCHAR.
+ (start_time::string), + (end_time::string), headway_secs, exact_times -) FROM STDIN csv; -` - -const exactTimes = (val) => { - if (val === '0') return 'frequency_based' - if (val === '1') return 'schedule_based' - throw new Error('invalid exact_times: ' + val) -} +); -const formatFrequenciesRow = (f) => { - const startTime = f.start_time - ? formatTime(f.start_time) - : null - const endTime = f.end_time - ? formatTime(f.end_time) - : null +CREATE INDEX frequencies_trip_id ON "${opt.schema}".frequencies (trip_id); +CREATE INDEX frequencies_exact_times ON "${opt.schema}".frequencies (exact_times); +`) - return [ - f.trip_id || null, - startTime, - endTime, - f.headway_secs ? parseInt(f.headway_secs) : null, - f.exact_times ? exactTimes(f.exact_times) : null, - ] + workingState.nrOfRowsByName.set('frequencies', await queryNumberOfRows(db, 'frequencies', opt)) } -const afterAll = (opt) => `\ -\\. - -CREATE INDEX ON "${opt.schema}".frequencies (trip_id); -CREATE INDEX ON "${opt.schema}".frequencies (exact_times); -` - -module.exports = { - beforeAll, - formatRow: formatFrequenciesRow, - afterAll, -} +module.exports = importData diff --git a/lib/get.js b/lib/get.js new file mode 100644 index 0000000..9affca3 --- /dev/null +++ b/lib/get.js @@ -0,0 +1,5 @@ +'use strict' + +const GET = Symbol('get') + +module.exports = GET \ No newline at end of file diff --git a/lib/index.js b/lib/index.js index 9490a3c..0bf4cda 100644 --- a/lib/index.js +++ b/lib/index.js @@ -1,8 +1,9 @@ 'use strict' module.exports = { - is_valid_lang_code: require('./prerequisites').is_valid_lang_code, - is_timezone: require('./prerequisites').is_timezone, + icu: require('./prerequisites').icu, + valid_lang_codes: require('./prerequisites').valid_lang_codes, + valid_timezones: require('./prerequisites').valid_timezones, shape_exists: require('./prerequisites').shape_exists, agency: require('./agency'), calendar: require('./calendar'), @@ -19,8 +20,4 @@ module.exports = { pathways: require('./pathways'), levels: require('./levels'), translations: require('./translations'), - import_metadata: require('./import_metadata'), - stats_by_route_date: require('./stats_by_route_date'), - stats_by_agency_route_stop_hour: require('./stats_by_agency_route_stop_hour'), - stats_active_trips_by_hour: require('./stats_active_trips_by_hour'), } diff --git a/lib/levels.js b/lib/levels.js index baf4b75..7bfe2b0 100644 --- a/lib/levels.js +++ b/lib/levels.js @@ -1,36 +1,39 @@ 'use strict' -const {formatTime} = require('./util') +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') // https://gtfs.org/schedule/reference/#levelstxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToLevels, opt, workingState) => { + await db[RUN](`\ CREATE TABLE "${opt.schema}".levels ( level_id TEXT PRIMARY KEY, - level_index DOUBLE PRECISION NOT NULL, + level_index REAL NOT NULL, level_name TEXT ); -COPY "${opt.schema}".levels ( - level_id, - level_index, - level_name -) FROM STDIN csv; -` - -const formatLevelsRow = (l) => { - return [ - l.level_id, - parseFloat(l.level_index), - l.level_name || null, - ] -} +INSERT INTO "${opt.schema}".levels +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * +FROM read_csv( + '${pathToLevels}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. 
+ -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + -- todo: all_varchar = true, types + types = { + level_index: 'REAL', + } +); -const afterAll = `\ -\\. -` +`) -module.exports = { - beforeAll, - formatRow: formatLevelsRow, - afterAll, + workingState.nrOfRowsByName.set('levels', await queryNumberOfRows(db, 'levels', opt)) } + +module.exports = importData diff --git a/lib/pathways.js b/lib/pathways.js index 10b0b7a..2cb5a08 100644 --- a/lib/pathways.js +++ b/lib/pathways.js @@ -1,9 +1,75 @@ 'use strict' -const {formatTime} = require('./util') +const GET = require('./get.js') +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') // https://gtfs.org/schedule/reference/#pathwaystxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToPathways, opt, workingState) => { + // Several columns are optional, so their columns may be missing in a `read_csv()` result. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. + const [{ + has_length, + has_traversal_time, + has_stair_count, + has_max_slope, + has_min_width, + has_signposted_as, + has_reversed_signposted_as, + }] = await db[GET](`\ + WITH columns AS ( + SELECT * + FROM ( + DESCRIBE ( + SELECT * + FROM read_csv( + '${pathToPathways}', + header = true + ) + LIMIT 1 + ) + ) + ) + SELECT + EXISTS( + SELECT * + FROM columns + WHERE column_name = 'length' + ) AS has_length, + EXISTS( + SELECT * + FROM columns + WHERE column_name = 'traversal_time' + ) AS has_traversal_time, + EXISTS( + SELECT * + FROM columns + WHERE column_name = 'stair_count' + ) AS has_stair_count, + EXISTS( + SELECT * + FROM columns + WHERE column_name = 'max_slope' + ) AS has_max_slope, + EXISTS( + SELECT * + FROM columns + WHERE column_name = 'min_width' + ) AS has_min_width, + EXISTS( + SELECT * + FROM columns + WHERE column_name = 'signposted_as' + ) AS has_signposted_as, + EXISTS( + SELECT * + FROM columns + WHERE column_name = 'reversed_signposted_as' + ) AS has_reversed_signposted_as +`) + + await db[RUN](`\ CREATE TYPE "${opt.schema}".pathway_mode_v AS ENUM ( 'walkway' -- 1 , 'stairs' -- 2 @@ -24,75 +90,44 @@ CREATE TABLE "${opt.schema}".pathways ( FOREIGN KEY (to_stop_id) REFERENCES "${opt.schema}".stops (stop_id), pathway_mode "${opt.schema}".pathway_mode_v NOT NULL, is_bidirectional BOOLEAN NOT NULL, - length DOUBLE PRECISION, -- todo: add non-negative constraint + length REAL, -- todo: add non-negative constraint traversal_time INTEGER, -- todo: add positive constraint stair_count INTEGER, -- todo: add non-0 constraint - max_slope DOUBLE PRECISION, - min_width DOUBLE PRECISION, -- todo: add positive constraint + max_slope REAL, + min_width REAL, -- todo: add positive constraint signposted_as TEXT, reversed_signposted_as TEXT ); -COPY "${opt.schema}".pathways ( - pathway_id, - from_stop_id, - to_stop_id, - pathway_mode, - is_bidirectional, - length, - traversal_time, - stair_count, - max_slope, - min_width, - signposted_as, - reversed_signposted_as -) FROM STDIN csv; -` - -const pathwayMode = (val) => { - if (val === '1') return 
'walkway' - if (val === '2') return 'stairs' - if (val === '3') return 'moving_sidewalk_travelator' - if (val === '4') return 'escalator' - if (val === '5') return 'elevator' - if (val === '6') return 'fare_gate' - if (val === '7') return 'exit_gate' - throw new Error('invalid pathway_mode: ' + val) -} - -const formatPathwaysRow = (p) => { - let is_bidirectional - if (p.is_bidirectional === '0') is_bidirectional = 'false' - else if (p.is_bidirectional === '1') is_bidirectional = 'true' - else throw new Error('invalid is_bidirectional: ' + p.is_bidirectional) +INSERT INTO "${opt.schema}".pathways +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * REPLACE ( + -- todo: check that is_bidirectional is actually 0 or 1 + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::pathway_mode_v)[pathway_mode] AS pathway_mode +) +FROM read_csv( + '${pathToPathways}', + header = true, + all_varchar = true, + types = { + pathway_mode: 'INTEGER', + is_bidirectional: 'INTEGER' + ${has_length ? `, length: 'REAL'` : ``} + ${has_traversal_time ? `, traversal_time: 'INTEGER'` : ``} + ${has_stair_count ? `, stair_count: 'INTEGER'` : ``} + ${has_max_slope ? `, max_slope: 'REAL'` : ``} + ${has_min_width ? `, min_width: 'REAL'` : ``} + ${has_signposted_as ? `, signposted_as: 'TEXT'` : ``} + ${has_reversed_signposted_as ? `, reversed_signposted_as: 'TEXT'` : ``} + } +); +`) - return [ - p.pathway_id, - p.from_stop_id, - p.to_stop_id, - pathwayMode(p.pathway_mode), - is_bidirectional, - p.length, - p.traversal_time, - p.stair_count, - p.max_slope, - p.min_width, - p.signposted_as || null, - p.reversed_signposted_as || null, - ] + workingState.nrOfRowsByName.set('pathways', await queryNumberOfRows(db, 'pathways', opt)) } -const afterAll = (opt) => `\ -\\. - -${opt.postgraphile ? `\ -CREATE INDEX ON "${opt.schema}".pathways (from_stop_id); -CREATE INDEX ON "${opt.schema}".pathways (to_stop_id); -` : ''} -` - -module.exports = { - beforeAll, - formatRow: formatPathwaysRow, - afterAll, -} +module.exports = importData diff --git a/lib/prerequisites.js b/lib/prerequisites.js index 4a5edcc..d88091d 100644 --- a/lib/prerequisites.js +++ b/lib/prerequisites.js @@ -1,91 +1,47 @@ 'use strict' -const is_valid_lang_code = { - beforeAll: (opt) => `\ --- Unfortunately information_schema.collations.collation_name only has --- identifiers with "_", not with "-", so we use pg_collation instead. --- https://www.postgresql.org/docs/current/infoschema-collations.html --- https://www.postgresql.org/docs/current/catalog-pg-collation.html --- todo [breaking]: rename to e.g. is_similar_to_bcp_47_tag? -CREATE OR REPLACE FUNCTION "${opt.schema}".is_bcp_47_tag( - input TEXT -) -RETURNS BOOLEAN -AS $$ - SELECT EXISTS ( - SELECT collctype - FROM pg_collation - WHERE ${opt.lowerCaseLanguageCodes ? `lower(collctype)` : `collctype`} = ${opt.lowerCaseLanguageCodes ? `lower(input)` : `input`} - OR ${opt.lowerCaseLanguageCodes ? `lower(collname)` : `collname`} = ${opt.lowerCaseLanguageCodes ? `lower(input)` : `input`} - OR ${opt.lowerCaseLanguageCodes ? `lower(collname)` : `collname`} = ${opt.lowerCaseLanguageCodes ? `lower(input)` : `input`} || '-x-icu' - LIMIT 1 - ); -$$ language sql STABLE; - -${opt.postgraphile ? 
`\ -COMMENT ON FUNCTION "${opt.schema}".is_bcp_47_tag IS E'@omit'; -` : ''} +const RUN = require('./run.js') --- todo [breaking]: remove -CREATE OR REPLACE FUNCTION "${opt.schema}".is_valid_lang_code( - input TEXT -) -RETURNS BOOLEAN -AS $$ - SELECT "${opt.schema}".is_bcp_47_tag(input); -$$ language sql STABLE; +const valid_lang_codes = async (db, _, opt) => { + await db[RUN](`\ +INSTALL icu; -- todo: make install optional? +LOAD icu; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".is_valid_lang_code IS E'@omit'; -` : ''} -`, +-- Unfortunately pragma_collations().collname only has +-- identifiers with "_", not with "-", so we use pg_collation instead. +-- see also https://duckdb.org/docs/sql/expressions/collations#icu-collations +-- todo: Also, entries like "de_DE" are missing. +CREATE TABLE "${opt.schema}".valid_lang_codes ( + lang_code TEXT PRIMARY KEY COLLATE NOCASE, +); +INSERT INTO "${opt.schema}".valid_lang_codes +SELECT + replace(collname, '_', '-') AS lang_code +FROM pragma_collations(); +`) } -const is_timezone = { - beforeAll: (opt) => `\ --- https://justatheory.com/2007/11/postgres-timezone-validation/ -CREATE OR REPLACE FUNCTION "${opt.schema}".is_timezone( - tz TEXT -) -RETURNS BOOLEAN -AS $$ - DECLARE - date TIMESTAMPTZ; - BEGIN - date := now() AT TIME ZONE tz; - RETURN TRUE; - EXCEPTION WHEN invalid_parameter_value THEN - RETURN FALSE; - END; -$$ language plpgsql STABLE; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".is_timezone IS E'@omit'; -` : ''} -`, -} -const shape_exists = { - beforeAll: (opt) => `\ -CREATE OR REPLACE FUNCTION "${opt.schema}".shape_exists( - some_shape_id TEXT -) -RETURNS BOOLEAN -AS $$ - SELECT EXISTS ( - SELECT shape_id - FROM "${opt.schema}".shapes - WHERE shape_id = some_shape_id - LIMIT 1 - ); -$$ language sql STABLE; +const valid_timezones = async (db, _, opt) => { + // DuckDB v0.10: "subqueries prohibited in CHECK constraints" + // > CONSTRAINT valid_timezone CHECK ("${opt.schema}".is_timezone(agency_timezone)) + // or inlined: + // > CONSTRAINT valid_timezone CHECK (EXISTS(SELECT name FROM pg_timezone_names() WHERE name = agency_timezone)) + // so we create a helper table instead + await db[RUN](`\ +INSTALL icu; -- todo: make install optional? +LOAD icu; -${opt.postgraphile ? 
`\ -COMMENT ON FUNCTION "${opt.schema}".shape_exists IS E'@omit'; -` : ''} -`, +CREATE TABLE "${opt.schema}".valid_timezones( + tz TEXT PRIMARY KEY +); +INSERT INTO "${opt.schema}".valid_timezones ( + SELECT name AS tz + FROM pg_timezone_names() +); +`) } module.exports = { - is_valid_lang_code, - is_timezone, - shape_exists, + valid_lang_codes, + valid_timezones, } diff --git a/lib/routes.js b/lib/routes.js index fe416e5..7267074 100644 --- a/lib/routes.js +++ b/lib/routes.js @@ -1,6 +1,8 @@ 'use strict' -const DataError = require('./data-error') +// const DataError = require('./data-error') +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') // Google's "Extended GTFS Route Types" // https://developers.google.com/transit/gtfs/reference/extended-route-types @@ -232,13 +234,26 @@ const routeTypesSchemes = Object.assign(Object.create(null), { }) // https://gtfs.org/schedule/reference/#routestxt -const beforeAll = (opt) => { +const importData = async (db, pathToRoutes, opt, workingState) => { if (!(opt.routeTypesScheme in routeTypesSchemes)) { throw new Error(`invalid opt.routeTypesScheme, must be one of these: ${Object.keys(routeTypesSchemes).join(', ')}.`) } const extRouteTypes = routeTypesSchemes[opt.routeTypesScheme] - return `\ + // The GTFS spec allows routes.agency_id to be empty/null if there is exactly one agency in the feed. + // It seems that GTFS has allowed this at least since 2016: + // https://github.com/google/transit/blame/217e9bf/gtfs/spec/en/reference.md#L544-L554 + const exactly1Agency = workingState.nrOfRowsByName.get('agency') === 1 + // todo: throw special error indicating an error in the input data? does the foreign key constraint achieve this implicitly? old code: + // throw new DataError( + // 'routes', + // 'agency_id must not be empty/null', + // [ + // 'The GTFS spec allows routes.agency_id to be empty/null only if there is exactly one agency in the feed.' + // ], + // ) + + await db[RUN](`\ CREATE TYPE "${opt.schema}".route_type_val AS ENUM ( -- basic types '0' -- 0 – Tram, Streetcar, Light rail. Any light rail or street level system within a metropolitan area. @@ -255,16 +270,14 @@ CREATE TYPE "${opt.schema}".route_type_val AS ENUM ( -- extended types ${extRouteTypes.map(([route_type, desc]) => `, '${route_type}' -- ${desc}`).join('\n')} ); -CREATE CAST ("${opt.schema}".route_type_val AS text) WITH INOUT AS IMPLICIT; --- todo [breaking]: use small table as enum? https://www.graphile.org/postgraphile/enums/#with-enum-tables -${opt.postgraphile ? `\ -COMMENT ON TYPE "${opt.schema}".route_type_val IS E'@enum\\n@enumName RouteType\\n'; -` : ''} CREATE TABLE "${opt.schema}".routes ( route_id TEXT PRIMARY KEY, agency_id TEXT, - ${opt.routesWithoutAgencyId ? '' : `FOREIGN KEY (agency_id) REFERENCES "${opt.schema}".agency,`} + ${opt.routesWithoutAgencyId || exactly1Agency + ? '' + : `FOREIGN KEY (agency_id) REFERENCES "${opt.schema}".agency,` + } -- todo: Either route_short_name or route_long_name must be specified, or potentially both if appropriate. 
route_short_name TEXT, route_long_name TEXT, @@ -276,64 +289,29 @@ CREATE TABLE "${opt.schema}".routes ( route_sort_order INT ); -COPY "${opt.schema}".routes ( - route_id, - agency_id, - route_short_name, - route_long_name, - route_desc, - route_type, - route_url, - route_color, - route_text_color, - route_sort_order -) FROM STDIN csv; -` -} - -const formatRoutesRow = (r, opt, workingState) => { - const agency_id = r.agency_id || null - if (agency_id === null && !opt.routesWithoutAgencyId) { - // The GTFS spec allows routes.agency_id to be empty/null if there is exactly one agency in the feed. - // It seems that GTFS has allowed this at least since 2016: - // https://github.com/google/transit/blame/217e9bf/gtfs/spec/en/reference.md#L544-L554 - if (workingState.nrOfRowsByName.get('agency') !== 1) { - // todo: throw special error indicating an error in the input data - throw new DataError( - 'routes', - 'agency_id must not be empty/null', - [ - 'The GTFS spec allows routes.agency_id to be empty/null only if there is exactly one agency in the feed.' - ], - ) - } +INSERT INTO "${opt.schema}".routes +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * +FROM read_csv( + '${pathToRoutes}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + -- todo: all_varchar = true, types + types = { + route_type: 'TEXT', } +); - return [ - r.route_id || null, - agency_id, - r.route_short_name || null, - r.route_long_name || null, - r.route_desc || null, - r.route_type || null, - r.route_url || null, - r.route_color || null, - r.route_text_color || null, - r.route_sort_order ? parseInt(r.route_sort_order) : null, - ] -} - -const afterAll = (opt) => `\ -\\. - -CREATE INDEX ON "${opt.schema}".routes (route_short_name); -${opt.postgraphile ? 
`\ -CREATE INDEX ON "${opt.schema}".routes (agency_id); -` : ''} -` +CREATE INDEX routes_route_short_name ON "${opt.schema}".routes (route_short_name); +`) -module.exports = { - beforeAll, - formatRow: formatRoutesRow, - afterAll, + workingState.nrOfRowsByName.set('routes', await queryNumberOfRows(db, 'routes', opt)) } + +module.exports = importData diff --git a/lib/rows-count.js b/lib/rows-count.js new file mode 100644 index 0000000..8107ca9 --- /dev/null +++ b/lib/rows-count.js @@ -0,0 +1,15 @@ +'use strict' + +const GET = require('./get.js') + +const queryNumberOfRows = async (db, dbName, opt) => { + const [{count: nrOfRows}] = await db[GET](` + SELECT count(*) AS count + FROM "${opt.schema}.${dbName}" + `) + return nrOfRows +} + +module.exports = { + queryNumberOfRows, +} \ No newline at end of file diff --git a/lib/run.js b/lib/run.js new file mode 100644 index 0000000..a155611 --- /dev/null +++ b/lib/run.js @@ -0,0 +1,5 @@ +'use strict' + +const RUN = Symbol('run') + +module.exports = RUN diff --git a/lib/service_days.js b/lib/service_days.js index 5df4b1f..33c71f4 100644 --- a/lib/service_days.js +++ b/lib/service_days.js @@ -1,8 +1,15 @@ 'use strict' -const afterAll = (opt) => `\ +const RUN = require('./run.js') + +// https://gtfs.org/schedule/reference/#calendar_datestxt +const importData = async (db, _, opt, workingState) => { + await db[RUN](`\ +-- DuckDB currently has no materialized views, only tables. +-- see https://github.com/duckdb/duckdb/discussions/3638#discussioncomment-2801284 +-- todo: what if i modify calendar/calendar_dates? define triggers? -- todo [breaking]: rename to service_dates? -CREATE MATERIALIZED VIEW "${opt.schema}".service_days AS +CREATE TABLE "${opt.schema}.service_days" AS SELECT base_days.service_id, base_days.date @@ -16,7 +23,7 @@ FROM ( SELECT service_id, "date", - extract(dow FROM "date") dow, + date_part('dow', "date") dow, sunday, monday, tuesday, @@ -27,12 +34,12 @@ FROM ( FROM ( SELECT *, - generate_series( + unnest(generate_series( start_date::TIMESTAMP, end_date::TIMESTAMP, '1 day'::INTERVAL - ) "date" - FROM "${opt.schema}".calendar + )) "date" + FROM "${opt.schema}.calendar" ) all_days_raw ) all_days WHERE (sunday = 'available' AND dow = 0) @@ -47,7 +54,7 @@ FROM ( -- "removed" exceptions LEFT JOIN ( SELECT * - FROM "${opt.schema}".calendar_dates + FROM "${opt.schema}.calendar_dates" WHERE exception_type = 'removed' ) removed ON base_days.service_id = removed.service_id @@ -56,23 +63,18 @@ WHERE removed.date IS NULL -- "added" exceptions UNION SELECT service_id, "date" -FROM "${opt.schema}".calendar_dates +FROM "${opt.schema}.calendar_dates" WHERE exception_type = 'added' ORDER BY service_id, "date"; -CREATE UNIQUE INDEX ON "${opt.schema}".service_days (service_id, date); +CREATE UNIQUE INDEX service_days_unique_service_id_date ON "${opt.schema}.service_days" (service_id, date); -CREATE INDEX ON "${opt.schema}".service_days (service_id); -CREATE INDEX ON "${opt.schema}".service_days (date); +CREATE INDEX service_days_service_id ON "${opt.schema}.service_days" (service_id); +CREATE INDEX service_days_date ON "${opt.schema}.service_days" (date); -- apparently the unique index (service_id, date) doesn't speed up queries -CREATE INDEX ON "${opt.schema}".service_days (service_id, date); - -${opt.postgraphile ? 
`\ -COMMENT ON MATERIALIZED VIEW "${opt.schema}".service_days IS E'@name serviceDates\\n@primaryKey service_id,date'; -` : ''} -` - -module.exports = { - afterAll, +CREATE INDEX service_days_service_id_date ON "${opt.schema}.service_days" (service_id, date); +`) } + +module.exports = importData diff --git a/lib/shapes.js b/lib/shapes.js index 2f2384d..e1a05d7 100644 --- a/lib/shapes.js +++ b/lib/shapes.js @@ -1,62 +1,88 @@ 'use strict' -// https://gtfs.org/schedule/reference/#shapestxt -const beforeAll = (opt) => `\ -CREATE TABLE "${opt.schema}".shapes ( - id SERIAL PRIMARY KEY, - shape_id TEXT, - shape_pt_sequence INT, - shape_pt_loc geography(POINT), - shape_dist_traveled REAL -); +const GET = require('./get.js') -COPY "${opt.schema}".shapes ( - shape_id, - shape_pt_loc, - shape_pt_sequence, - shape_dist_traveled -) FROM STDIN csv; -` - -const formatShapesRow = (s) => { - return [ - s.shape_id || null, - `POINT(${parseFloat(s.shape_pt_lon)} ${parseFloat(s.shape_pt_lat)})`, - s.shape_pt_sequence ? parseInt(s.shape_pt_sequence) : null, - s.shape_dist_traveled ? parseInt(s.shape_dist_traveled) : null, - ] -} +// https://gtfs.org/schedule/reference/#shapestxt +const importData = async (db, pathToShapes, opt, workingState) => { + // shape_dist_traveled is optional, so the entire column can be missing. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. + const [ + {has_shape_dist_traveled}, + ] = await db[GET](`\ + SELECT EXISTS( + SELECT * + FROM ( + DESCRIBE ( + SELECT * + FROM read_csv( + '${pathToShapes}', + header = true + ) + LIMIT 1 + ) + ) columns + WHERE column_name = 'shape_dist_traveled' + ) AS has_shape_dist_traveled +`) -const afterAll = (opt) => `\ -\\. + // todo: why does extracting `Count` directly work here and not with other files? + const [ + {Count: nrOfShapes}, + ] = await db[GET](`\ +INSTALL spatial; -- todo: make install optional? +LOAD spatial; -CREATE INDEX shapes_by_shape_id ON "${opt.schema}".shapes (shape_id); -CREATE INDEX ON "${opt.schema}".shapes (shape_id, shape_pt_sequence); +CREATE TABLE "${opt.schema}".shapes ( + shape_id TEXT PRIMARY KEY, + shape GEOMETRY, + distances_travelled REAL[] +); -CREATE OR REPLACE VIEW "${opt.schema}".shapes_aggregated AS +INSERT INTO "${opt.schema}".shapes +-- WITH +-- csv_columns AS ( +-- SELECT list(column_name) AS cols +-- FROM ( +-- DESCRIBE ( +-- SELECT * +-- FROM read_csv( +-- 'node_modules/sample-gtfs-feed/gtfs/shapes.txt', +-- header = true +-- ) +-- ) +-- ) columns +-- ), +-- table_columns AS ( +-- SELECT list(column_name) +-- FROM ( +-- DESCRIBE shapes +-- ) columns +-- ) +-- SELECT COLUMNS(x -> x IN (SELECT cols FROM csv_columns)) SELECT - shape_id, - array_agg(shape_dist_traveled) AS distances_travelled, - ST_MakeLine(array_agg(shape_pt_loc)) AS shape + any_value(shape_id) AS shape_id, + ST_MakeLine(array_agg(ST_Point(shape_pt_lon, shape_pt_lat))) AS shape, + ${has_shape_dist_traveled ? `array_agg(shape_dist_traveled)` : `NULL`} AS distances_travelled FROM ( - SELECT - shape_id, - shape_dist_traveled, - ST_AsText(shape_pt_loc)::geometry AS shape_pt_loc - FROM "${opt.schema}".shapes - ORDER by shape_id, shape_pt_sequence -) shapes + SELECT * + FROM read_csv( + '${pathToShapes}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. 
+ -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'] + -- todo: all_varchar = true, types + ) + ORDER BY shape_id, shape_pt_sequence +) t GROUP BY shape_id; +`) -${opt.postgraphile ? `\ -COMMENT ON TABLE "${opt.schema}".shapes IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".shapes.id IS E'@omit'; -COMMENT ON VIEW "${opt.schema}".shapes_aggregated IS E'@name shapes\\n@primaryKey shape_id'; -` : ''} -` - -module.exports = { - beforeAll, - formatRow: formatShapesRow, - afterAll, + // Note: This is not the number of shapes.txt rows! + workingState.nrOfRowsByName.set('shapes', nrOfShapes) } + +module.exports = importData diff --git a/lib/stats_active_trips_by_hour.js b/lib/stats_active_trips_by_hour.js index e369261..2b2b4b9 100644 --- a/lib/stats_active_trips_by_hour.js +++ b/lib/stats_active_trips_by_hour.js @@ -21,10 +21,8 @@ WITH FROM "${opt.schema}".service_days ), date_offset AS ( - SELECT greatest( - "${opt.schema}".largest_arrival_time(), - "${opt.schema}".largest_departure_time() - ) AS o + SELECT largest + FROM largest_arr_dep_time ), date_min_max AS ( SELECT diff --git a/lib/stop_times.js b/lib/stop_times.js index dd1c37c..d8ac52a 100644 --- a/lib/stop_times.js +++ b/lib/stop_times.js @@ -1,9 +1,45 @@ 'use strict' -const {formatTime} = require('./util') +const GET = require('./get.js') +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') // https://gtfs.org/schedule/reference/#stop_timestxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToStopTimes, opt, workingState) => { + // timepoint & shape_dist_traveled are optional, so the entire columns can be missing. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. + const [{ + has_shape_dist_traveled, + has_timepoint, + }] = await db[GET](`\ + WITH columns AS ( + SELECT * + FROM ( + DESCRIBE ( + SELECT * + FROM read_csv( + '${pathToStopTimes}', + header = true + ) + LIMIT 1 + ) + ) + ) + SELECT + EXISTS( + SELECT * + FROM columns + WHERE column_name = 'timepoint' + ) AS has_timepoint, + EXISTS( + SELECT * + FROM columns + WHERE column_name = 'shape_dist_traveled' + ) AS has_shape_dist_traveled +`) + + await db[RUN](`\ CREATE TYPE "${opt.schema}".pickup_drop_off_type AS ENUM ( 'regular' -- 0 or empty - Regularly scheduled pickup/dropoff. , 'not_available' -- 1 – No pickup/dropoff available. 
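-- A rough sketch of the spatial building blocks used for shapes above, with made-up
-- coordinates (requires the spatial extension):
INSTALL spatial;
LOAD spatial;
SELECT ST_AsText(ST_MakeLine([
	ST_Point(13.37, 52.52),
	ST_Point(13.38, 52.53)
])) AS shape_wkt;
-- should yield something like LINESTRING (13.37 52.52, 13.38 52.53)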
@@ -29,139 +65,111 @@ CREATE TABLE "${opt.schema}".stop_times ( stop_sequence INT NOT NULL, stop_sequence_consec INT, stop_headsign TEXT, - pickup_type "${opt.schema}".pickup_drop_off_type, - drop_off_type "${opt.schema}".pickup_drop_off_type, + pickup_type "${opt.schema}".pickup_drop_off_type, -- todo: NOT NULL & ifnull() + drop_off_type "${opt.schema}".pickup_drop_off_type, -- todo: NOT NULL & ifnull() shape_dist_traveled REAL, - timepoint "${opt.schema}".timepoint_v + timepoint "${opt.schema}".timepoint_v, + PRIMARY KEY (trip_id, stop_sequence) ); -COPY "${opt.schema}".stop_times ( - trip_id, - arrival_time, - departure_time, - stop_id, - stop_sequence, - stop_headsign, - pickup_type, - drop_off_type, - shape_dist_traveled, - timepoint -) FROM STDIN csv; -` - -const pickupDropOffType = (val) => { - if (val === '0') return 'regular' - if (val === '1') return 'not_available' - if (val === '2') return 'call' - if (val === '3') return 'driver' - throw new Error('invalid/unsupported pickup_type/drop_off_type: ' + val) -} - -const timepoint = (val) => { - if (val === '0') return 'approximate' - if (val === '1') return 'exact' - throw new Error('invalid/unsupported timepoint_v: ' + val) -} - -const formatStopTimesRow = (s) => { - const arrTime = s.arrival_time - ? formatTime(s.arrival_time) - : null - const depTime = s.departure_time - ? formatTime(s.departure_time) - : null - - return [ - s.trip_id || null, - arrTime, - depTime, - s.stop_id || null, - s.stop_sequence ? parseInt(s.stop_sequence) : null, - s.stop_headsign || null, - s.pickup_type ? pickupDropOffType(s.pickup_type) : null, - s.drop_off_type ? pickupDropOffType(s.drop_off_type) : null, - s.shape_dist_traveled || null, - s.timepoint ? timepoint(s.timepoint) : null, - ] -} - -const afterAll = (opt) => `\ -\\. - -CREATE INDEX ON "${opt.schema}".stop_times (trip_id); -CREATE INDEX ON "${opt.schema}".stop_times (stop_id); - -${opt.postgraphile ? `\ -COMMENT ON COLUMN "${opt.schema}".stop_times.stop_sequence_consec IS E'@name stopSequenceConsecutive'; -` : ''} - -UPDATE "${opt.schema}".stop_times -SET stop_sequence_consec = t.seq -FROM ( - SELECT - row_number() OVER (PARTITION BY trip_id ORDER BY stop_sequence ASC)::integer - 1 AS seq, - trip_id, stop_sequence - FROM "${opt.schema}".stop_times -) AS t -WHERE "${opt.schema}".stop_times.trip_id = t.trip_id -AND "${opt.schema}".stop_times.stop_sequence = t.stop_sequence; - -CREATE INDEX ON "${opt.schema}".stop_times (stop_sequence_consec); -CREATE INDEX ON "${opt.schema}".stop_times (trip_id, stop_sequence_consec); -CREATE INDEX ON "${opt.schema}".stop_times (arrival_time DESC NULLS LAST); -CREATE INDEX ON "${opt.schema}".stop_times (departure_time DESC NULLS LAST); --- todo: are these two necessary? 
-CREATE INDEX ON "${opt.schema}".stop_times (arrival_time); -CREATE INDEX ON "${opt.schema}".stop_times (departure_time); - -CREATE OR REPLACE FUNCTION "${opt.schema}".largest_departure_time () -RETURNS interval AS $$ - SELECT departure_time - FROM "${opt.schema}".stop_times - WHERE EXISTS ( - SELECT * - FROM "${opt.schema}".trips - JOIN "${opt.schema}".service_days ON service_days.service_id = trips.service_id - WHERE trips.trip_id = stop_times.trip_id - ) - ORDER BY departure_time DESC NULLS LAST - LIMIT 1; -$$ LANGUAGE SQL IMMUTABLE; -CREATE OR REPLACE FUNCTION "${opt.schema}".largest_arrival_time () -RETURNS interval AS $$ - SELECT arrival_time - FROM "${opt.schema}".stop_times - WHERE EXISTS ( - SELECT * - FROM "${opt.schema}".trips - JOIN "${opt.schema}".service_days ON service_days.service_id = trips.service_id - WHERE trips.trip_id = stop_times.trip_id +INSERT INTO "${opt.schema}".stop_times +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT + row_number() OVER (PARTITION BY trip_id ORDER BY stop_sequence ASC) AS stop_sequence_consec, + ${has_shape_dist_traveled ? `` : `NULL AS shape_dist_traveled,`} + ${has_timepoint ? `` : `NULL AS timepoint,`} + * + REPLACE ( + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + -- todo: what if these are NULL? + enum_range(NULL::pickup_drop_off_type)[drop_off_type + 1] AS drop_off_type, + enum_range(NULL::pickup_drop_off_type)[pickup_type + 1] AS pickup_type + ${has_timepoint ? `,enum_range(NULL::timepoint_v)[timepoint + 1] AS timepoint` : ''} ) - ORDER BY arrival_time DESC NULLS LAST - LIMIT 1; -$$ LANGUAGE SQL IMMUTABLE; -CREATE OR REPLACE FUNCTION "${opt.schema}".dates_filter_min ( - _timestamp TIMESTAMP WITH TIME ZONE -) -RETURNS date AS $$ - SELECT date_trunc( - 'day', - _timestamp - - GREATEST( - "${opt.schema}".largest_arrival_time(), - "${opt.schema}".largest_departure_time() +FROM read_csv( + '${pathToStopTimes}', + header = true, + all_varchar = true, + types = { + arrival_time: 'INTERVAL', + departure_time: 'INTERVAL', + stop_sequence: 'INTEGER', + pickup_type: 'INTEGER', + drop_off_type: 'INTEGER', + ${has_shape_dist_traveled ? `shape_dist_traveled: 'REAL',` : ``} + ${has_timepoint ? `timepoint: 'INTEGER',` : ``} + } +); + +-- todo: are all of them beneficial/necessary? +CREATE INDEX stop_times_trip_id ON "${opt.schema}".stop_times (trip_id); +CREATE INDEX stop_times_stop_id ON "${opt.schema}".stop_times (stop_id); +CREATE INDEX stop_times_stop_sequence_consec ON "${opt.schema}".stop_times (stop_sequence_consec); +CREATE INDEX stop_times_trip_id_stop_sequence_consec ON "${opt.schema}".stop_times (trip_id, stop_sequence_consec); +-- As of DuckDB v1.0.0, indexes on INTERVAL columns are not supported yet. +-- todo: alternatively just change these columns to INTEGER? +-- CREATE INDEX stop_times_arrival_time ON "${opt.schema}".stop_times (arrival_time); +-- CREATE INDEX stop_times_departure_time ON "${opt.schema}".stop_times (departure_time); + +-- todo: materialize? 
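-- A rough sketch of the enum_range()-based cast used above, with a made-up enum name;
-- DuckDB list indexing is 1-based, hence the "+ 1":
CREATE TYPE pickup_drop_off_type_example AS ENUM ('regular', 'not_available', 'call', 'driver');
SELECT
	enum_range(NULL::pickup_drop_off_type_example)[0 + 1] AS a, -- 'regular'
	enum_range(NULL::pickup_drop_off_type_example)[3 + 1] AS b; -- 'driver'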
+CREATE OR REPLACE VIEW "${opt.schema}".largest_arr_dep_time AS +WITH + largest_departure_time AS ( + SELECT departure_time + FROM "${opt.schema}".stop_times + WHERE EXISTS ( + SELECT * + FROM "${opt.schema}".trips + JOIN "${opt.schema}".service_days ON service_days.service_id = trips.service_id + WHERE trips.trip_id = stop_times.trip_id ) - -- we assume the DST <-> standard time shift is always <= 1h - - '1 hour 1 second'::interval - ); -$$ LANGUAGE SQL IMMUTABLE; --- This function doesn't do much, we just provide it to match date_filter_min(). -CREATE OR REPLACE FUNCTION "${opt.schema}".dates_filter_max ( - _timestamp TIMESTAMP WITH TIME ZONE -) -RETURNS date AS $$ - SELECT date_trunc('day', _timestamp); -$$ LANGUAGE SQL IMMUTABLE; + ORDER BY departure_time DESC + LIMIT 1 + ), + largest_arrival_time AS ( + SELECT arrival_time + FROM "${opt.schema}".stop_times + WHERE EXISTS ( + SELECT * + FROM "${opt.schema}".trips + JOIN "${opt.schema}".service_days ON service_days.service_id = trips.service_id + WHERE trips.trip_id = stop_times.trip_id + ) + ORDER BY arrival_time DESC + LIMIT 1 + ) +SELECT + to_seconds(greatest( + epoch(arrival_time), + epoch(departure_time) + )) AS largest +FROM largest_departure_time, largest_arrival_time; + +-- CREATE OR REPLACE FUNCTION "${opt.schema}".dates_filter_min ( +-- _timestamp TIMESTAMP WITH TIME ZONE +-- ) +-- RETURNS date AS $$ +-- SELECT date_trunc( +-- 'day', +-- _timestamp +-- - GREATEST( +-- "${opt.schema}".largest_arrival_time(), +-- "${opt.schema}".largest_departure_time() +-- ) +-- -- we assume the DST <-> standard time shift is always <= 1h +-- - '1 hour 1 second'::interval +-- ); +-- $$ LANGUAGE SQL IMMUTABLE; +-- -- This function doesn't do much, we just provide it to match date_filter_min(). +-- CREATE OR REPLACE FUNCTION "${opt.schema}".dates_filter_max ( +-- _timestamp TIMESTAMP WITH TIME ZONE +-- ) +-- RETURNS date AS $$ +-- SELECT date_trunc('day', _timestamp); +-- $$ LANGUAGE SQL IMMUTABLE; CREATE OR REPLACE VIEW "${opt.schema}".arrivals_departures AS WITH stop_times_based AS NOT MATERIALIZED ( @@ -241,73 +249,67 @@ WITH stop_times_based AS NOT MATERIALIZED ( -- stop_times-based arrivals/departures SELECT ( - encode(trip_id::bytea, 'base64') - || ':' || encode(( + to_base64(encode(trip_id)) + || ':' || to_base64(encode( extract(ISOYEAR FROM "date") || '-' || lpad(extract(MONTH FROM "date")::text, 2, '0') || '-' || lpad(extract(DAY FROM "date")::text, 2, '0') - )::bytea, 'base64') - || ':' || encode((stop_sequence::text)::bytea, 'base64') + )) + || ':' || to_base64(encode(stop_sequence::text)) -- frequencies_row - || ':' || encode('-1'::bytea, 'base64') + || ':' || to_base64(encode('-1')) -- frequencies_it - || ':' || encode('-1'::bytea, 'base64') + || ':' || to_base64(encode('-1')) ) as arrival_departure_id, - stop_times_based.*, -- todo: expose local arrival/departure "wall clock time"? 
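-- A rough sketch of the epoch()/greatest()/to_seconds() composition used in
-- largest_arr_dep_time above, with made-up GTFS-style >24h times:
SELECT to_seconds(greatest(
	epoch(INTERVAL '25:30:00'),
	epoch(INTERVAL '26:00:00')
)) AS largest; -- 26:00:00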
-1 AS frequencies_row, - -1 AS frequencies_it + -1 AS frequencies_it, + + stop_times_based.* + EXCLUDE ( + arrival_time, + departure_time + ) FROM stop_times_based -UNION ALL +UNION ALL BY NAME -- frequencies-based arrivals/departures SELECT ( - encode(trip_id::bytea, 'base64') - || ':' || encode(( + to_base64(encode(trip_id)) + || ':' || to_base64(encode( extract(ISOYEAR FROM "date") || '-' || lpad(extract(MONTH FROM "date")::text, 2, '0') || '-' || lpad(extract(DAY FROM "date")::text, 2, '0') - )::bytea, 'base64') - || ':' || encode((stop_sequence::text)::bytea, 'base64') - || ':' || encode((frequencies_row::text)::bytea, 'base64') - || ':' || encode((frequencies_it::text)::bytea, 'base64') + )) + || ':' || to_base64(encode(stop_sequence::text)) + || ':' || to_base64(encode(frequencies_row::text)) + || ':' || to_base64(encode(frequencies_it::text)) ) as arrival_departure_id, - * -FROM ( -SELECT - *, - row_number() OVER (PARTITION BY trip_id, "date", stop_sequence)::integer AS frequencies_it -FROM ( -SELECT - -- stop_times_based.* except t_arrival & t_departure, duh - -- todo: find a way to use all columns without explicitly enumerating them here - agency_id, - route_id, route_short_name, route_long_name, route_type, - trip_id, direction_id, trip_headsign, wheelchair_accessible, bikes_allowed, - service_id, - shape_id, - "date", - stop_sequence, stop_sequence_consec, - stop_headsign, pickup_type, drop_off_type, shape_dist_traveled, timepoint, - tz, - arrival_time, -- todo [breaking]: this is misleading, remove it - generate_series( - t_arrival - stop_times_offset + start_time, - t_arrival - stop_times_offset + end_time, - INTERVAL '1 second' * headway_secs - ) as t_arrival, - departure_time, -- todo [breaking]: this is misleading, remove it - generate_series( - t_departure - stop_times_offset + start_time, - t_departure - stop_times_offset + end_time, - INTERVAL '1 second' * headway_secs - ) as t_departure, - stop_id, stop_name, - station_id, station_name, - wheelchair_boarding, - frequencies_row + + -- todo + frequencies_based.* + EXCLUDE ( + arrival_time, + departure_time, + start_time, + end_time, + stop_times_offset, + headway_secs + ) + REPLACE ( + unnest(generate_series( + t_arrival - stop_times_offset + start_time, + t_arrival - stop_times_offset + end_time, + INTERVAL '1 second' * headway_secs + )) as t_arrival, + unnest(generate_series( + t_departure - stop_times_offset + start_time, + t_departure - stop_times_offset + end_time, + INTERVAL '1 second' * headway_secs + )) as t_departure, + ) FROM ( SELECT stop_times_based.*, @@ -332,34 +334,19 @@ FROM ( ) t ) frequencies_based; -CREATE OR REPLACE FUNCTION "${opt.schema}".arrival_departure_by_arrival_departure_id(id TEXT) -RETURNS "${opt.schema}".arrivals_departures -AS $$ - SELECT * - FROM "${opt.schema}".arrivals_departures - WHERE trip_id = convert_from(decode(split_part(id, ':', 1), 'base64'), 'UTF-8')::text - AND "date" = (convert_from(decode(split_part(id, ':', 2), 'base64'), 'UTF-8')::text)::timestamp - AND stop_sequence = (convert_from(decode(split_part(id, ':', 3), 'base64'), 'UTF-8')::text)::integer - AND (convert_from(decode(split_part(id, ':', 4), 'base64'), 'UTF-8')::text)::integer = frequencies_row - AND (convert_from(decode(split_part(id, ':', 5), 'base64'), 'UTF-8')::text)::integer = frequencies_it - -- todo: what if there are >1 rows? - LIMIT 1; -$$ LANGUAGE SQL STABLE STRICT; - -${opt.postgraphile ? `\ --- todo: currently named arrivalsDeparture, should be arrivalDeparture (but allArrivalsDeparturesList!) 
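-- Rough sketches of the star-expression modifiers and UNION ALL BY NAME used above, with
-- made-up tables and values:
CREATE TABLE star_example (a INTEGER, b INTEGER, c INTEGER);
INSERT INTO star_example VALUES (1, 2, 3);
SELECT * EXCLUDE (c) REPLACE (b * 10 AS b)
FROM star_example; -- a = 1, b = 20

SELECT 1 AS x, 2 AS y
UNION ALL BY NAME
SELECT 4 AS y, 3 AS x; -- columns are matched by name, not by position: (1, 2) & (3, 4)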
-COMMENT ON COLUMN "${opt.schema}".arrivals_departures.route_short_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.route_long_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.route_type IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.direction_id IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.trip_headsign IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.stop_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.station_name IS E'@omit'; --- > If you want to rename just one field or type, your best bet is to use a [@name] smart comment […]. --- > NOTE: this still uses the inflectors, but it pretends that the tables name is different, so the input to the inflectors differs. --- https://www.graphile.org/postgraphile/inflection/#overriding-naming---one-off -COMMENT ON VIEW "${opt.schema}".arrivals_departures IS E'@name arrival_departures\\n@primaryKey trip_id,date,stop_sequence,frequencies_row,frequencies_it\\n@foreignKey (route_id) references routes|@fieldName route\\n@foreignKey (trip_id) references trips|@fieldName trip\\n@foreignKey (stop_id) references stops|@fieldName stop\\n@foreignKey (station_id) references stops|@fieldName station'; -` : ''} +-- CREATE OR REPLACE FUNCTION "${opt.schema}".arrival_departure_by_arrival_departure_id(id TEXT) +-- RETURNS "${opt.schema}".arrivals_departures +-- AS $$ +-- SELECT * +-- FROM "${opt.schema}".arrivals_departures +-- WHERE trip_id = decode(from_base64(split_part(id, ':', 1))) +-- AND "date" = decode(from_base64(split_part(id, ':', 2)))::timestamp +-- AND stop_sequence = decode(from_base64(split_part(id, ':', 3)))::integer +-- AND decode(from_base64(split_part(id, ':', 4)))::integer = frequencies_row +-- AND decode(from_base64(split_part(id, ':', 5)))::integer = frequencies_it +-- -- todo: what if there are >1 rows? 
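-- A rough sketch, with made-up values, of how such a base64-encoded composite ID can be
-- taken apart again using split_part(), from_base64() & decode():
WITH id_example AS (
	SELECT to_base64(encode('trip-1')) || ':' || to_base64(encode('2024-05-01')) AS id
)
SELECT
	decode(from_base64(split_part(id, ':', 1))) AS trip_id, -- 'trip-1'
	decode(from_base64(split_part(id, ':', 2)))::DATE AS "date" -- 2024-05-01
FROM id_example;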
+-- LIMIT 1; +-- $$ LANGUAGE SQL STABLE STRICT; CREATE OR REPLACE VIEW "${opt.schema}".connections AS WITH stop_times_based AS NOT MATERIALIZED ( @@ -489,89 +476,65 @@ WITH stop_times_based AS NOT MATERIALIZED ( -- stop_times-based connections SELECT ( - encode(trip_id::bytea, 'base64') - || ':' || encode(( + to_base64(encode(trip_id)) + || ':' || to_base64(encode( extract(ISOYEAR FROM "date") || '-' || lpad(extract(MONTH FROM "date")::text, 2, '0') || '-' || lpad(extract(DAY FROM "date")::text, 2, '0') - )::bytea, 'base64') - || ':' || encode((from_stop_sequence::text)::bytea, 'base64') + )) + || ':' || to_base64(encode(from_stop_sequence::text)) -- frequencies_row - || ':' || encode('-1'::bytea, 'base64') + || ':' || to_base64(encode('-1')) -- frequencies_it - || ':' || encode('-1'::bytea, 'base64') + || ':' || to_base64(encode('-1')) ) as connection_id, - stop_times_based.*, - -1 AS frequencies_row, - -1 AS frequencies_it + -1 AS frequencies_it, + + stop_times_based.* + EXCLUDE ( + arrival_time, + departure_time + ) FROM stop_times_based -UNION ALL +UNION ALL BY NAME -- frequencies-based connections SELECT ( - encode(trip_id::bytea, 'base64') - || ':' || encode(( + to_base64(encode(trip_id)) + || ':' || to_base64(encode( extract(ISOYEAR FROM "date") || '-' || lpad(extract(MONTH FROM "date")::text, 2, '0') || '-' || lpad(extract(DAY FROM "date")::text, 2, '0') - )::bytea, 'base64') - || ':' || encode((from_stop_sequence::text)::bytea, 'base64') - || ':' || encode((frequencies_row::text)::bytea, 'base64') - || ':' || encode((frequencies_it::text)::bytea, 'base64') + )) + || ':' || to_base64(encode(from_stop_sequence::text)) + || ':' || to_base64(encode(frequencies_row::text)) + || ':' || to_base64(encode(frequencies_it::text)) ) as connection_id, - -- stop_times_based.* except t_arrival & t_departure, duh - -- todo: find a way to use all columns without explicitly enumerating them here - route_id, route_short_name, route_long_name, route_type, - trip_id, - service_id, - direction_id, - trip_headsign, - wheelchair_accessible, - bikes_allowed, - - from_stop_id, - from_stop_name, - from_station_id, - from_station_name, - from_wheelchair_boarding, - - from_stop_headsign, - from_pickup_type, - generate_series( - t_departure - stop_times_offset + start_time, - t_departure - stop_times_offset + end_time, - INTERVAL '1 second' * headway_secs - ) as t_departure, - departure_time, -- todo [breaking]: this is misleading, remove it - from_stop_sequence, - from_stop_sequence_consec, - from_timepoint, - - "date", - - to_timepoint, - to_stop_sequence, - to_stop_sequence_consec, - generate_series( - t_arrival - stop_times_offset + start_time, - t_arrival - stop_times_offset + end_time, - INTERVAL '1 second' * headway_secs - ) as t_arrival, - arrival_time, -- todo [breaking]: this is misleading, remove it - to_drop_off_type, - to_stop_headsign, - - to_stop_id, - to_stop_name, - to_station_id, - to_station_name, - to_wheelchair_boarding, - - frequencies_row, - frequencies_it + -- todo + frequencies_based.* + EXCLUDE ( + arrival_time, + departure_time, + start_time, + end_time, + stop_times_offset, + headway_secs + ) + REPLACE ( + unnest(generate_series( + t_departure - stop_times_offset + start_time, + t_departure - stop_times_offset + end_time, + INTERVAL '1 second' * headway_secs + )) as t_departure, + unnest(generate_series( + t_arrival - stop_times_offset + start_time, + t_arrival - stop_times_offset + end_time, + INTERVAL '1 second' * headway_secs + )) as t_arrival + ) FROM ( SELECT 
stop_times_based.*, @@ -591,42 +554,22 @@ FROM ( ) frequencies ON frequencies.trip_id = stop_times_based.trip_id ) frequencies_based; -CREATE OR REPLACE FUNCTION "${opt.schema}".connection_by_connection_id(id TEXT) -RETURNS "${opt.schema}".connections -AS $$ - SELECT * - FROM "${opt.schema}".connections - WHERE trip_id = convert_from(decode(split_part(id, ':', 1), 'base64'), 'UTF-8')::text - AND "date" = (convert_from(decode(split_part(id, ':', 2), 'base64'), 'UTF-8')::text)::timestamp - AND from_stop_sequence = (convert_from(decode(split_part(id, ':', 3), 'base64'), 'UTF-8')::text)::integer - AND (convert_from(decode(split_part(id, ':', 4), 'base64'), 'UTF-8')::text)::integer = frequencies_row - AND (convert_from(decode(split_part(id, ':', 5), 'base64'), 'UTF-8')::text)::integer = frequencies_it - -- todo: what if there are >1 rows? - LIMIT 1; -$$ LANGUAGE SQL STABLE STRICT; - -${opt.postgraphile ? `\ --- todo: currently named arrivalsDeparture, should be arrivalDeparture (but allArrivalsDeparturesList!) --- todo: allow filtering based on stop and/or route and/or trip and/or time frame --- https://www.graphile.org/postgraphile/functions/#setof-functions---connections -COMMENT ON COLUMN "${opt.schema}".connections.route_short_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.route_long_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.route_type IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.direction_id IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.trip_headsign IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.from_stop_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.from_station_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.to_stop_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.to_station_name IS E'@omit'; -COMMENT ON VIEW "${opt.schema}".connections IS E'@primaryKey trip_id,date,from_stop_sequence,frequencies_row,frequencies_it\\n@foreignKey (route_id) references routes|@fieldName route\\n@foreignKey (trip_id) references trips|@fieldName trip\\n@foreignKey (from_stop_id) references stops|@fieldName fromStop\\n@foreignKey (from_station_id) references stops|@fieldName fromStation\\n@foreignKey (to_stop_id) references stops|@fieldName toStop\\n@foreignKey (to_station_id) references stops|@fieldName toStation'; -` : ''} -` - - - - -module.exports = { - beforeAll, - formatRow: formatStopTimesRow, - afterAll, +-- CREATE OR REPLACE FUNCTION "${opt.schema}".connection_by_connection_id(id TEXT) +-- RETURNS "${opt.schema}".connections +-- AS $$ +-- SELECT * +-- FROM "${opt.schema}".connections +-- WHERE trip_id = decode(from_base64(split_part(id, ':', 1))) +-- AND "date" = decode(from_base64(split_part(id, ':', 2)))::timestamp +-- AND from_stop_sequence = decode(from_base64(split_part(id, ':', 3)))::integer +-- AND decode(from_base64(split_part(id, ':', 4)))::integer = frequencies_row +-- AND decode(from_base64(split_part(id, ':', 5)))::integer = frequencies_it +-- -- todo: what if there are >1 rows? 
+-- LIMIT 1; +-- $$ LANGUAGE SQL STABLE STRICT; +`) + + workingState.nrOfRowsByName.set('stop_times', await queryNumberOfRows(db, 'stop_times', opt)) } + +module.exports = importData diff --git a/lib/stops.js b/lib/stops.js index 0a629d8..06443f6 100644 --- a/lib/stops.js +++ b/lib/stops.js @@ -1,7 +1,11 @@ 'use strict' +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/schedule/reference/#stopstxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToStops, opt, workingState) => { + await db[RUN](`\ CREATE TYPE "${opt.schema}".location_type_val AS ENUM ( 'stop' -- 0 (or blank): Stop (or Platform). A location where passengers board or disembark from a transit vehicle. Is called a platform when defined within a parent_station. , 'station' -- 1 – Station. A physical structure or area that contains one or more platform. @@ -32,95 +36,104 @@ CREATE TYPE "${opt.schema}".wheelchair_boarding_val AS ENUM ( ); CREATE CAST ("${opt.schema}".wheelchair_boarding_val AS text) WITH INOUT AS IMPLICIT; +INSTALL spatial; -- todo: make install optional? +LOAD spatial; + CREATE TABLE "${opt.schema}".stops ( stop_id TEXT PRIMARY KEY, stop_code TEXT, -- todo: Required for locations which are stops (location_type=0), stations (location_type=1) or entrances/exits (location_type=2). Optional for locations which are generic nodes (location_type=3) or boarding areas (location_type=4). stop_name TEXT, stop_desc TEXT, - stop_loc geography(POINT), -- stop_lat/stop_lon + stop_loc GEOMETRY, -- stop_lat/stop_lon zone_id TEXT, stop_url TEXT, location_type "${opt.schema}".location_type_val, parent_station TEXT, - stop_timezone TEXT CHECK ("${opt.schema}".is_timezone(stop_timezone)), + -- In stops.txt, *any* row's parent_station might reference *any* other row. Essentially, stops.txt describes a tree. + -- As of DuckDB v1.0.0, it *seems* like adding a foreign key constraint here doesn't work, even if we order the stops to put parents before their children (see below). + -- todo: Report this with DuckDB? Alternatively, add the constraint after the import (see below). + -- FOREIGN KEY (parent_station) REFERENCES "${opt.schema}".stops, + stop_timezone TEXT, + FOREIGN KEY (stop_timezone) REFERENCES "${opt.schema}".valid_timezones, wheelchair_boarding "${opt.schema}".wheelchair_boarding_val, level_id TEXT, ${opt.stopsWithoutLevelId ? '' : `FOREIGN KEY (level_id) REFERENCES "${opt.schema}".levels,`} platform_code TEXT ); -COPY "${opt.schema}".stops ( - stop_id, - stop_code, - stop_name, - stop_desc, - stop_loc, - zone_id, - stop_url, - location_type, - parent_station, - stop_timezone, - wheelchair_boarding, - level_id, - platform_code -) FROM STDIN csv; -` +INSERT INTO "${opt.schema}".stops +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +WITH RECURSIVE + stops AS ( + SELECT + ST_Point(stop_lon, stop_lat) AS stop_loc, + * + EXCLUDE ( + stop_lat, stop_lon + ) + REPLACE ( + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. 
+ enum_range(NULL::location_type_val)[location_type + 1] AS location_type, + enum_range(NULL::wheelchair_boarding_val)[ifnull(wheelchair_boarding, 0) + 1] AS wheelchair_boarding + ) + FROM read_csv( + '${pathToStops}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + -- todo: all_varchar = true, types + types = { + stop_code: 'TEXT', + platform_code: 'TEXT', + } + ) + ), + -- order the stops to put parents before their children + stops_sorted_by_parents AS ( + ( + SELECT + *, + stop_id AS root_id, + 1 AS recursion_level + FROM stops + WHERE parent_station IS NULL + ) + UNION ALL + ( + SELECT + children.*, + parent.root_id, + parent.recursion_level + 1 + FROM stops children + JOIN stops_sorted_by_parents parent ON parent.stop_id = children.parent_station + ) + ) +SELECT * EXCLUDE ( + -- omit sorting helper columns + root_id, + recursion_level +) +FROM stops_sorted_by_parents +ORDER BY root_id, recursion_level, stop_id; -const locationType = (val) => { - if (val === '0') return 'stop' - if (val === '1') return 'station' - if (val === '2') return 'entrance_exit' - if (val === '3') return 'node' - if (val === '4') return 'boarding_area' - throw new Error('invalid/unsupported location_type: ' + val) -} +-- todo: DuckDB v1.0.0 doesn't support them yet: +-- > The ADD CONSTRAINT and DROP CONSTRAINT clauses are not yet supported in DuckDB. +-- ALTER TABLE "${opt.schema}".stops +-- ADD CONSTRAINT stops_parent_station_fkey +-- FOREIGN KEY (parent_station) REFERENCES "${opt.schema}".stops; -const wheelchairBoarding = (val) => { - if (val === '0') return 'no_info_or_inherit' - if (val === '1') return 'accessible' - if (val === '2') return 'not_accessible' - throw new Error('invalid/unsupported wheelchair_boarding: ' + val) -} +CREATE INDEX stops_parent_station ON "${opt.schema}".stops (parent_station); +${opt.stopsLocationIndex ? `CREATE INDEX stops_stop_loc ON "${opt.schema}".stops (stop_loc);` : ''} +`) -const formatStopsRow = (s) => { - return [ - s.stop_id || null, - s.stop_code || null, - s.stop_name || null, - s.stop_desc || null, - `POINT(${parseFloat(s.stop_lon)} ${parseFloat(s.stop_lat)})`, - s.zone_id || null, - s.stop_url || null, - s.location_type - ? locationType(s.location_type) - : null, - s.parent_station || null, - s.stop_timezone || null, - s.wheelchair_boarding - ? wheelchairBoarding(s.wheelchair_boarding) - : null, - s.level_id || null, - s.platform_code || null, - ] + workingState.nrOfRowsByName.set('stops', await queryNumberOfRows(db, 'stops', opt)) } -const afterAll = (opt) => `\ -\\. - -ALTER TABLE "${opt.schema}".stops -ADD CONSTRAINT stops_parent_station_fkey -FOREIGN KEY (parent_station) REFERENCES "${opt.schema}".stops; - -CREATE INDEX ON "${opt.schema}".stops (parent_station); -${opt.stopsLocationIndex ? `CREATE INDEX ON "${opt.schema}".stops (stop_loc);` : ''} -${opt.postgraphile ? 
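-- A rough sketch of the recursive parent-before-children ordering used above, on a made-up
-- three-level stop tree:
WITH RECURSIVE
	stops_example(stop_id, parent_station) AS (
		VALUES
			('platform-1', 'station-a'),
			('station-a', NULL),
			('boarding-area-1', 'platform-1')
	),
	sorted AS (
		SELECT stop_id, parent_station, stop_id AS root_id, 1 AS recursion_level
		FROM stops_example
		WHERE parent_station IS NULL
		UNION ALL
		SELECT c.stop_id, c.parent_station, p.root_id, p.recursion_level + 1
		FROM stops_example c
		JOIN sorted p ON p.stop_id = c.parent_station
	)
SELECT stop_id
FROM sorted
ORDER BY root_id, recursion_level, stop_id;
-- station-a, platform-1, boarding-area-1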
`\ -CREATE INDEX ON "${opt.schema}".stops (level_id); -` : ''} -` - -module.exports = { - beforeAll, - formatRow: formatStopsRow, - afterAll, -} +module.exports = importData diff --git a/lib/transfers.js b/lib/transfers.js index dd29566..bdd1793 100644 --- a/lib/transfers.js +++ b/lib/transfers.js @@ -1,7 +1,34 @@ 'use strict' +const GET = require('./get.js') +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/schedule/reference/#transferstxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToTransfers, opt, workingState) => { + // min_transfer_time is optional, so the entire column can be missing. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. + const [ + {has_min_transfer_time}, + ] = await db[GET](`\ + SELECT EXISTS( + SELECT * + FROM ( + DESCRIBE ( + SELECT * + FROM read_csv( + '${pathToTransfers}', + header = true + ) + LIMIT 1 + ) + ) columns + WHERE column_name = 'min_transfer_time' + ) AS has_min_transfer_time +`) + + await db[RUN](`\ CREATE TYPE "${opt.schema}".transfer_type_v AS ENUM ( 'recommended' -- 0 or empty - Recommended transfer point between routes. , 'timed' -- 1 - Timed transfer point between two routes. The departing vehicle is expected to wait for the arriving one and leave sufficient time for a rider to transfer between routes. @@ -11,7 +38,6 @@ CREATE TYPE "${opt.schema}".transfer_type_v AS ENUM ( CREATE CAST ("${opt.schema}".transfer_type_v AS text) WITH INOUT AS IMPLICIT; CREATE TABLE "${opt.schema}".transfers ( - id SERIAL PRIMARY KEY, from_stop_id TEXT, FOREIGN KEY (from_stop_id) REFERENCES "${opt.schema}".stops, to_stop_id TEXT, @@ -25,67 +51,39 @@ CREATE TABLE "${opt.schema}".transfers ( from_trip_id TEXT, FOREIGN KEY (from_trip_id) REFERENCES "${opt.schema}".trips, to_trip_id TEXT, - FOREIGN KEY (from_trip_id) REFERENCES "${opt.schema}".trips + FOREIGN KEY (from_trip_id) REFERENCES "${opt.schema}".trips, + -- We're not using a primary key index here because several columns can be NULL. + UNIQUE ( + from_stop_id, + from_trip_id, + from_route_id, + to_stop_id, + to_trip_id, + to_route_id + ) ); -ALTER TABLE "${opt.schema}".transfers -ADD CONSTRAINT transfers_sig -UNIQUE ( - from_stop_id, - to_stop_id, - from_route_id, - to_route_id, - from_trip_id, - to_trip_id +INSERT INTO "${opt.schema}".transfers +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * REPLACE ( + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::transfer_type_v)[transfer_type + 1] AS transfer_type +) +FROM read_csv( + '${pathToTransfers}', + header = true, + all_varchar = true, + types = { + transfer_type: 'INTEGER' + ${has_min_transfer_time ? 
`, min_transfer_time: 'INTEGER'` : ``} + } ); +`) -COPY "${opt.schema}".transfers ( - from_stop_id, - to_stop_id, - transfer_type, - min_transfer_time, - from_route_id, - to_route_id, - from_trip_id, - to_trip_id -) FROM STDIN csv; -` - -const transferType = (val) => { - if (val === '0') return 'recommended' - if (val === '1') return 'timed' - if (val === '2') return 'minimum_time' - if (val === '3') return 'impossible' - throw new Error('invalid/unsupported transfer_type: ' + val) + workingState.nrOfRowsByName.set('frequencies', await queryNumberOfRows(db, 'frequencies', opt)) } -const formatTransfersRow = (t) => { - return [ - t.from_stop_id || null, - t.to_stop_id || null, - t.transfer_type ? transferType(t.transfer_type) : null, - t.min_transfer_time ? parseInt(t.min_transfer_time) : null, - t.from_route_id, - t.to_route_id, - t.from_trip_id, - t.to_trip_id, - ] -} - -const afterAll = (opt) => `\ -\\. - -${opt.postgraphile ? `\ -CREATE INDEX ON "${opt.schema}".transfers (from_route_id); -CREATE INDEX ON "${opt.schema}".transfers (from_trip_id); -CREATE INDEX ON "${opt.schema}".transfers (to_stop_id); -CREATE INDEX ON "${opt.schema}".transfers (to_route_id); -CREATE INDEX ON "${opt.schema}".transfers (to_trip_id); -` : ''} -` - -module.exports = { - beforeAll, - formatRow: formatTransfersRow, - afterAll, -} +module.exports = importData diff --git a/lib/translations.js b/lib/translations.js index 0318357..0f2a97c 100644 --- a/lib/translations.js +++ b/lib/translations.js @@ -1,789 +1,732 @@ 'use strict' +const {strictEqual, ok} = require('assert') +// const GET = require('./get.js') +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') + +// > ## record_id +// > Defines the record that corresponds to the field to be translated. The value in record_id must be the first or only field of a table's primary key, as defined in the primary key attribute for each table and below: +// > - agency_id for agency +// > - stop_id for stops +// > - route_id for routes +// > - trip_id for trips +// > - trip_id for stop_times +// > - pathway_id for pathways +// > - level_id for levels +// > - attribution_id for attribution +// > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. Below is the recommended way to use record_id for those tables: +// > - service_id for calendar +// > - service_id for calendar_dates +// > - fare_id for fare_attributes +// > - fare_id for fare_rules +// > - shape_id for shapes +// > - trip_id for frequencies +// > - from_stop_id for transfers +// > ## record_sub_id +// > Helps the record that contains the field to be translated when the table doesn’t have a unique ID. Therefore, the value in record_sub_id is the secondary ID of the table, as defined by the table below: +// > - None for agency.txt +// > - None for stops.txt +// > - None for routes.txt +// > - None for trips.txt +// > - stop_sequence for stop_times.txt +// > - None for pathways.txt +// > - None for levels.txt +// > - None for attributions.txt +// > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. 
Below is the recommended way to use record_sub_id for those tables: +// > - None for calendar.txt +// > - date for calendar_dates.txt +// > - None for fare_attributes.txt +// > - route_id for fare_rules.txt +// > - None for shapes.txt +// > - start_time for frequencies.txt +// > - to_stop_id for transfers.txt // https://gtfs.org/schedule/reference/#translationstxt -const beforeAll = (opt) => `\ -CREATE OR REPLACE FUNCTION "${opt.schema}".table_exists( - t_name TEXT -) -RETURNS BOOLEAN -AS $$ - SELECT EXISTS ( - SELECT FROM pg_tables - WHERE schemaname = '${opt.schema}' - AND tablename = t_name - LIMIT 1 - ); -$$ LANGUAGE sql STABLE; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".table_exists IS E'@omit'; -` : ''} - -CREATE OR REPLACE FUNCTION "${opt.schema}".column_exists( - t_name TEXT, - c_name TEXT -) -RETURNS BOOLEAN -AS $$ - SELECT EXISTS ( - SELECT FROM information_schema.columns - WHERE table_schema = '${opt.schema}' - AND table_name = t_name - AND column_name = c_name - LIMIT 1 - ); -$$ LANGUAGE sql STABLE; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".column_exists IS E'@omit'; -` : ''} - -CREATE TABLE "${opt.schema}"._translations_ref_cols ( - table_name TEXT PRIMARY KEY, - -- todo: only check if columns exist when table exists? - record_id_col TEXT NOT NULL - CONSTRAINT valid_record_id_col CHECK ( - NOT "${opt.schema}".table_exists(table_name) - OR - "${opt.schema}".column_exists(table_name, record_id_col) - ), - record_sub_id_col TEXT - CONSTRAINT valid_record_sub_id_col CHECK ( - NOT "${opt.schema}".table_exists(table_name) - OR - record_sub_id_col IS NULL - OR - "${opt.schema}".column_exists(table_name, record_sub_id_col) - ) -); -${opt.postgraphile ? `\ -COMMENT ON TABLE "${opt.schema}"._translations_ref_cols IS E'@omit'; -` : ''} - --- > ## record_id --- > Defines the record that corresponds to the field to be translated. The value in record_id must be the first or only field of a table's primary key, as defined in the primary key attribute for each table and below: --- > - agency_id for agency --- > - stop_id for stops --- > - route_id for routes --- > - trip_id for trips --- > - trip_id for stop_times --- > - pathway_id for pathways --- > - level_id for levels --- > - attribution_id for attribution --- > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. Below is the recommended way to use record_id for those tables: --- > - service_id for calendar --- > - service_id for calendar_dates --- > - fare_id for fare_attributes --- > - fare_id for fare_rules --- > - shape_id for shapes --- > - trip_id for frequencies --- > - from_stop_id for transfers --- > ## record_sub_id --- > Helps the record that contains the field to be translated when the table doesn’t have a unique ID. Therefore, the value in record_sub_id is the secondary ID of the table, as defined by the table below: --- > - None for agency.txt --- > - None for stops.txt --- > - None for routes.txt --- > - None for trips.txt --- > - stop_sequence for stop_times.txt --- > - None for pathways.txt --- > - None for levels.txt --- > - None for attributions.txt --- > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. 
Below is the recommended way to use record_sub_id for those tables: --- > - None for calendar.txt --- > - date for calendar_dates.txt --- > - None for fare_attributes.txt --- > - route_id for fare_rules.txt --- > - None for shapes.txt --- > - start_time for frequencies.txt --- > - to_stop_id for transfers.txt --- https://gtfs.org/schedule/reference/#translationstxt -INSERT INTO "${opt.schema}"._translations_ref_cols ( - table_name, - record_id_col, - record_sub_id_col -) VALUES - -- todo: feed_info - ('agency', 'agency_id', NULL), - ('stops', 'stop_id', NULL), - ('routes', 'route_id', NULL), - ('trips', 'trip_id', NULL), - ('stop_times', 'trip_id', 'stop_sequence'), - ('pathways', 'pathway_id', NULL), - ('levels', 'level_id', NULL), - ('attribution', 'attribution_id', NULL), - ('calendar', 'service_id', NULL), - ('calendar_dates', 'service_id', 'date'), - ('fare_attributes', 'fare_id', NULL), - ('fare_rules', 'fare_id', 'route_id'), - ('shapes', 'shape_id', NULL), - ('frequencies', 'trip_id', 'start_time'), - ('transfers', 'from_stop_id', 'to_stop_id') -; - -CREATE OR REPLACE FUNCTION "${opt.schema}".row_exists( - table_name TEXT, - col_a_name TEXT, - col_a_value TEXT, - col_b_name TEXT, - col_b_value TEXT -) -RETURNS BOOLEAN -AS $$ - DECLARE - result BOOLEAN; - BEGIN - IF col_b_name IS NULL THEN - EXECUTE format(' - SELECT EXISTS( - SELECT * - FROM %I.%I -- schema, table_name - WHERE %I = %L -- col_a_name, col_a_value - LIMIT 1 - ) - ', '${opt.schema}', table_name, col_a_name, col_a_value) - INTO STRICT result; - RETURN result; - ELSE - EXECUTE format(' - SELECT EXISTS( - SELECT * - FROM %I.%I -- schema, table_name - WHERE %I = %L -- col_a_name, col_a_value - AND %I = %L -- col_b_name, col_b_value - LIMIT 1 - ) - ', '${opt.schema}', table_name, col_a_name, col_a_value, col_b_name, col_b_value) - INTO STRICT result; - RETURN result; - END IF; - END; -$$ LANGUAGE plpgsql STABLE; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".row_exists IS E'@omit'; -` : ''} - --- todo: assert that row_exists works as intended --- SELECT row_exists('stops', 'stop_id', 'de:11000:900120017::2', NULL, NULL); -- Virchowstr. (Berlin) --- SELECT row_exists('stops', 'stop_name', 'Virchowstr. (Berlin)', NULL, NULL); -- Virchowstr. (Berlin) --- SELECT row_exists('stops', 'stop_id', 'non-existent', NULL, NULL); --- SELECT row_exists('stops', 'stop_name', 'non-existent', NULL, NULL); --- SELECT row_exists('stops', 'stop_id', 'de:11000:900120017::2', 'parent_station', 'de:11000:900120017'); -- Virchowstr. (Berlin) with valid parent_station --- SELECT row_exists('stops', 'stop_name', 'Virchowstr. (Berlin)', 'parent_station', 'de:11000:900120017'); -- Virchowstr. (Berlin) with valid parent_station --- SELECT row_exists('stops', 'stop_id', 'de:11000:900120017::2', 'parent_station', 'non-existent'); -- Virchowstr. (Berlin) with invalid parent_station --- SELECT row_exists('stops', 'stop_name', 'Virchowstr. (Berlin)', 'parent_station', 'non-existent'); -- Virchowstr. (Berlin) with invalid parent_station --- SELECT row_exists('stops', 'stop_id', 'de:11000:900120017::2', 'non-existent', 'de:11000:900120017'); -- Virchowstr. (Berlin) with invalid column B, should fail --- SELECT row_exists('stops', 'stop_name', 'Virchowstr. (Berlin)', 'non-existent', 'de:11000:900120017'); -- Virchowstr. 
(Berlin) with invalid column B, should fail --- todo: assert that it fails with 2 rows - -CREATE OR REPLACE FUNCTION "${opt.schema}".is_valid_translation_ref( - _table_name TEXT, - _field_name TEXT, - _record_id TEXT, - _record_sub_id TEXT, - _field_value TEXT -) -RETURNS BOOLEAN -AS $$ - DECLARE - _record_id_col TEXT; - _record_sub_id_col TEXT; - result BOOLEAN; - BEGIN - IF _record_id IS NOT NULL THEN - SELECT record_id_col - FROM "${opt.schema}"._translations_ref_cols - WHERE table_name = _table_name - LIMIT 1 - INTO _record_id_col; - SELECT record_sub_id_col - FROM "${opt.schema}"._translations_ref_cols - WHERE table_name = _table_name - LIMIT 1 - INTO _record_sub_id_col; - - IF _record_sub_id_col IS NULL AND _record_sub_id IS NOT NULL THEN - RAISE EXCEPTION - USING - MESSAGE = format('record_sub_id must be NULL for %I but is %L', _table_name, _record_sub_id), - ERRCODE = 'data_exception'; - END IF; - SELECT "${opt.schema}".row_exists( - _table_name, - _record_id_col, _record_id, - _record_sub_id_col, _record_sub_id - ) - INTO STRICT result; - RETURN result; - ELSEIF _field_value IS NOT NULL THEN - SELECT "${opt.schema}".row_exists( - _table_name, - _field_name, _field_value, - NULL, NULL - ) - INTO STRICT result; - RETURN result; - ELSE - RAISE EXCEPTION - USING - MESSAGE = 'Either record_id or field_value must be NOT NULL', - HINT = 'Refer to translations.txt the GTFS Static/Schedule reference.', - ERRCODE = 'data_exception'; - END IF; - END; -$$ LANGUAGE plpgsql STABLE; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".is_valid_translation_ref IS E'@omit'; -` : ''} - --- The MobilityData GTFS Validator just uses Java's Locale#toLanguageTag() to validate "language". --- https://github.com/MobilityData/gtfs-validator/blob/a11b7489902dd54dc194af1f1515583406ba3716/main/src/main/java/org/mobilitydata/gtfsvalidator/table/GtfsTranslationSchema.java#L36 --- https://docs.oracle.com/javase/7/docs/api/java/util/Locale.html --- related: https://github.com/google/transit/pull/98 - --- https://gtfs.org/schedule/reference/#translationstxt -CREATE TABLE "${opt.schema}".translations ( - -- > Defines the table that contains the field to be translated. Allowed values are: - -- > agency, stops, routes, trips, stop_times, pathways, levels, feed_info, attributions - -- > Any file added to GTFS will have a table_name value equivalent to the file name, as listed above (i.e., not including the .txt file extension). - table_name TEXT NOT NULL, - - -- > Name of the field to be translated. […] Fields with other types should not be translated. - field_name TEXT NOT NULL - CONSTRAINT valid_field_name CHECK ( - NOT "${opt.schema}".table_exists(table_name) - OR - "${opt.schema}".column_exists(table_name, field_name) - ), - - language TEXT NOT NULL - CONSTRAINT valid_language CHECK ( - NOT "${opt.schema}".table_exists(table_name) - OR - "${opt.schema}".is_valid_lang_code(language) - ), - - translation TEXT NOT NULL, - - -- > Defines the record that corresponds to the field to be translated. The value in record_id must be the first or only field of a table's primary key, as defined in the primary key attribute for each table and below […]. - -- > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. […] - -- > Conditionally Required: - -- > - Forbidden if table_name is feed_info. - -- > - Forbidden if field_value is defined. 
- -- > - Required if field_value is empty. - record_id TEXT, - - -- > Helps the record that contains the field to be translated when the table doesn’t have a unique ID. Therefore, the value in record_sub_id is the secondary ID of the table, as defined by the table below: - -- > - None for agency.txt - -- > - None for stops.txt - -- > - None for routes.txt - -- > - None for trips.txt - -- > - stop_sequence for stop_times.txt - -- > - None for pathways.txt - -- > - None for levels.txt - -- > - None for attributions.txt - -- > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. Below is the recommended way to use record_sub_id for those tables: - -- > - None for calendar.txt - -- > - date for calendar_dates.txt - -- > - None for fare_attributes.txt - -- > - route_id for fare_rules.txt - -- > - None for shapes.txt - -- > - start_time for frequencies.txt - -- > - to_stop_id for transfers.txt - -- > Conditionally Required: - -- > - Forbidden if table_name is feed_info. - -- > - Forbidden if field_value is defined. - -- > - Required if table_name=stop_times and record_id is defined. - record_sub_id TEXT, - - -- > Instead of […] using record_id and record_sub_id, this field can be used […]. When used, the translation will be applied when the fields identified by table_name and field_name contains the exact same value defined in field_value. - -- > The field must have exactly the value defined in field_value. If only a subset of the value matches field_value, the translation won’t be applied. - -- > Conditionally Required: - -- > - Forbidden if table_name is feed_info. - -- > - Forbidden if record_id is defined. - -- > - Required if record_id is empty. - -- todo: - -- > If two translation rules match the same record (one with field_value, and the other one with record_id), the rule with record_id takes precedence. 
- field_value TEXT, - - CONSTRAINT field_value_or_record_id CHECK ( - field_value IS NULL OR record_id IS NULL - ), - CONSTRAINT not_with_feed_info CHECK ( - field_value IS NULL OR table_name != 'feed_info' - ), +const supportedTranslationRefs = new Map([ + ['agency', { + src_table_name: 'agency', + record_id_column: 'agency_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['stops', { + src_table_name: 'stops', + record_id_column: 'stop_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['routes', { + src_table_name: 'routes', + record_id_column: 'route_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['trips', { + src_table_name: 'trips', + record_id_column: 'trip_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['stop_times', { + src_table_name: 'stop_times', + record_id_column: 'trip_id', + record_sub_id_column: 'stop_sequence', record_sub_id_column_type: 'INTEGER', + }], + ['pathways', { + src_table_name: 'pathways', + record_id_column: 'pathway_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['levels', { + src_table_name: 'levels', + record_id_column: 'level_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + // todo: attribution.txt is not supported yet + // ['attribution', { + // src_table_name: 'attribution', + // record_id_column: 'attribution_id', + // record_sub_id_column: null, record_sub_id_column_type: null, + // }], + ['calendar', { + src_table_name: 'calendar', + record_id_column: 'service_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['calendar_dates', { + src_table_name: 'calendar_dates', + record_id_column: 'service_id', + record_sub_id_column: 'date', record_sub_id_column_type: 'DATE', + }], + // todo: fare_attributes.txt & fare_rules.txt are not supported yet + // ['fare_attributes', { + // src_table_name: 'fare_attributes', + // record_id_column: 'fare_id', + // record_sub_id_column: null, record_sub_id_column_type: null, + // }], + // ['fare_rules', { + // src_table_name: 'fare_rules', + // record_id_column: 'fare_id', + // record_sub_id_column: 'route_id', record_sub_id_column_type: 'TEXT', + // }], + ['shapes', { + src_table_name: 'shapes', + record_id_column: 'shape_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + // frequencies.txt has no primary key and/or unique index yet because DuckDB doesn't support indexes on INTERVAL. See frequencies.js for more details. + // ['frequencies', { + // src_table_name: 'frequencies', + // record_id_column: 'trip_id', + // record_sub_id_column: 'start_time', record_sub_id_column_type: 'INTERVAL', + // }], + // transfers' rows are *not* unique on (from_stop_id, to_stop_id), so we cannot create a foreign key reference on the table. 
+ // todo: find a workaround + // ['transfers', { + // src_table_name: 'transfers', + // record_id_column: 'from_stop_id', + // record_sub_id_column: 'to_stop_id', record_sub_id_column_type: 'TEXT', + // }], + ['feed_info', { + src_table_name: 'feed_info', + record_id_column: null, + record_sub_id_column: null, record_sub_id_column_type: null, + }], +]) + +const _srcTableRefSql = (schema, table_name) => { + return `"${schema}"."_translations_${table_name}"` +} - CONSTRAINT valid_reference CHECK ( - NOT "${opt.schema}".table_exists(table_name) - OR - table_name = 'feed_info' - OR - "${opt.schema}".is_valid_translation_ref( - table_name, - field_name, - record_id, - record_sub_id, - field_value +const _srcTablesSql = (pathToTranslations, schema, table_name, translationRef) => { + const { + record_id_column, + record_sub_id_column, record_sub_id_column_type, + } = translationRef + + const hasCol = record_id_column !== null + const colRef = hasCol ? `"${record_id_column}"` : null + const hasSubCol = record_sub_id_column !== null + const subColRef = hasSubCol ? `"${record_sub_id_column}"` : null + const tableRef = `"${schema}"."${table_name}"` + const srcTableRef = _srcTableRefSql(schema, table_name) + + return `\ +CREATE TABLE ${srcTableRef} ( + ${hasCol ? `record_id TEXT NOT NULL,` : ``} + ${hasSubCol ? `record_sub_id ${record_sub_id_column_type} NOT NULL,` : ``} +${hasCol ? `\ + FOREIGN KEY ( + record_id + ${hasSubCol ? `, record_sub_id` : ``} ) - ), - - -- > Primary key (table_name, field_name, language, record_id, record_sub_id, field_value) - -- https://gtfs.org/schedule/reference/#translationstxt - -- PostgreSQL doesn't allow NULL values for primary key columns, so we use UNIQUE. - UNIQUE ( - table_name, - field_name, - language, - record_id, - record_sub_id, - field_value - ) + REFERENCES ${tableRef} ( + ${colRef} + ${hasSubCol ? `, ${subColRef}` : ``} + ),\ +` : ``} + field_name TEXT NOT NULL, -- todo: validate via all_columns helper view + language TEXT NOT NULL, -- todo: validate just like agency.agency_lang + translation TEXT NOT NULL ); - -COPY "${opt.schema}".translations ( - table_name, +INSERT INTO ${srcTableRef} +SELECT + ${hasCol ? `record_id,` : ``} + ${hasSubCol ? `record_sub_id,` : ``} field_name, language, - translation, - record_id, - record_sub_id, - field_value -) FROM STDIN csv; + translation +FROM read_csv( + '${pathToTranslations}', + header = true, + all_varchar = true +) +WHERE table_name = '${table_name}' +-- todo: support field_value-based translations +AND field_value IS NULL; ` - -const formatTranslationsRow = (t) => { - return [ - t.table_name || null, - t.field_name || null, - t.language || null, - t.translation || null, - t.record_id || null, - t.record_sub_id || null, - t.field_value || null, - ] } - -const afterAll = (opt) => `\ -\\. 
- --- todo -CREATE INDEX ON "${opt.schema}".translations ( - table_name, - field_name, - language, - record_id, - record_sub_id, - field_value +strictEqual( + _srcTablesSql('foo/trans.txt', 'gtfs', 'feed_info', { + record_id_column: null, + record_sub_id_column: null, record_sub_id_column_type: null, + }), + `\ +CREATE TABLE "gtfs"."_translations_feed_info" ( + + + + field_name TEXT NOT NULL, -- todo: validate via all_columns helper view + language TEXT NOT NULL, -- todo: validate just like agency.agency_lang + translation TEXT NOT NULL ); - -CREATE OR REPLACE VIEW "${opt.schema}".stops_translated AS +INSERT INTO "gtfs"."_translations_feed_info" SELECT - -- almost all columns, duh - -- todo: find a way to use all columns without explicitly enumerating them here - stop_id, - stop_code, - coalesce(stop_n_t.translation, stop_name) as stop_name, - stop_n_t.language as stop_name_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(stop_d_t.translation, stop_desc) as stop_desc, - stop_d_t.language as stop_desc_lang, -- todo: fall back to feed_info.feed_lang? - stop_loc, - zone_id, - coalesce(stop_u_t.translation, stop_url) as stop_url, - stop_u_t.language as stop_url_lang, -- todo: fall back to feed_info.feed_lang? - location_type, - parent_station, - stop_timezone, - wheelchair_boarding, - level_id, - platform_code -FROM "${opt.schema}".stops s -LEFT JOIN "${opt.schema}".translations stop_n_t ON ( - stop_n_t.table_name = 'stops' AND stop_n_t.field_name = 'stop_name' - AND (s.stop_id = stop_n_t.record_id OR s.stop_name = stop_n_t.field_value) + + + field_name, + language, + translation +FROM read_csv( + 'foo/trans.txt', + header = true, + all_varchar = true ) -LEFT JOIN "${opt.schema}".translations stop_d_t ON ( - stop_d_t.table_name = 'stops' AND stop_d_t.field_name = 'stop_desc' - AND (s.stop_id = stop_d_t.record_id OR s.stop_name = stop_d_t.field_value) +WHERE table_name = 'feed_info' +-- todo: support field_value-based translations +AND field_value IS NULL; +`, + '_srcTablesSql with feed_info.txt', ) -LEFT JOIN "${opt.schema}".translations stop_u_t ON ( - stop_u_t.table_name = 'stops' AND stop_u_t.field_name = 'stop_url' - AND (s.stop_id = stop_u_t.record_id OR s.stop_name = stop_u_t.field_value) +strictEqual( + _srcTablesSql('foo/trans.txt', 'gtfs', 'calendar_dates', { + record_id_column: 'service_id', + record_sub_id_column: 'date', record_sub_id_column_type: 'DATE', + }), + `\ +CREATE TABLE "gtfs"."_translations_calendar_dates" ( + record_id TEXT NOT NULL, + record_sub_id DATE NOT NULL, + FOREIGN KEY ( + record_id + , record_sub_id + ) + REFERENCES "gtfs"."calendar_dates" ( + "service_id" + , "date" + ), + field_name TEXT NOT NULL, -- todo: validate via all_columns helper view + language TEXT NOT NULL, -- todo: validate just like agency.agency_lang + translation TEXT NOT NULL ); -${opt.postgraphile ? 
`\ -COMMENT ON VIEW "${opt.schema}".stops_translated IS E'@omit'; - -CREATE OR REPLACE FUNCTION "${opt.schema}".stops_translated_stop_name ( - stop stops, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, stops.stop_name) - FROM "${opt.schema}".stops - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'stops' AND t.field_name = 'stop_name' - AND (stops.stop_id = t.record_id OR stops.stop_name = t.field_value) - ) - WHERE stops.stop_id = stop.stop_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -` : ''} - -CREATE OR REPLACE VIEW "${opt.schema}".routes_translated AS +INSERT INTO "gtfs"."_translations_calendar_dates" SELECT - -- almost all columns, duh - -- todo: find a way to use all columns without explicitly enumerating them here - route_id, - agency_id, - coalesce(route_s_t.translation, route_short_name) as route_short_name, - route_s_t.language as route_short_name_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(route_l_t.translation, route_long_name) as route_long_name, - route_l_t.language as route_long_name_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(route_d_t.translation, route_desc) as route_desc, - route_d_t.language as route_desc_lang, -- todo: fall back to feed_info.feed_lang? - route_type, - coalesce(route_u_t.translation, route_url) as route_url, - route_u_t.language as route_url_lang, -- todo: fall back to feed_info.feed_lang? - route_color, - route_text_color, - route_sort_order -FROM "${opt.schema}".routes r -LEFT JOIN "${opt.schema}".translations route_s_t ON ( - route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' - AND (r.route_id = route_s_t.record_id OR r.route_short_name = route_s_t.field_value) + record_id, + record_sub_id, + field_name, + language, + translation +FROM read_csv( + 'foo/trans.txt', + header = true, + all_varchar = true ) -LEFT JOIN "${opt.schema}".translations route_l_t ON ( - route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' - AND (r.route_id = route_l_t.record_id OR r.route_long_name = route_l_t.field_value) +WHERE table_name = 'calendar_dates' +-- todo: support field_value-based translations +AND field_value IS NULL; +`, + '_srcTablesSql with calendar_dates.txt', ) -LEFT JOIN "${opt.schema}".translations route_d_t ON ( - route_d_t.table_name = 'routes' AND route_d_t.field_name = 'route_desc' - AND (r.route_id = route_d_t.record_id OR r.route_long_name = route_d_t.field_value) + +const _selectToBeMergedSql = (schema, table_name, translationRef) => { + const { + record_id_column, + record_sub_id_column, + } = translationRef + + const hasCol = record_id_column !== null + const hasSubCol = record_sub_id_column !== null + const srcTableRef = _srcTableRefSql(schema, table_name) + + return `\ + SELECT + '${table_name}' AS table_name, + -- Some UNION-ed tables have non-TEXT record_id/record_sub_id columns (e.g. INTEGER). + -- Given that UNION ALL does implicit casts to match the *first* table, we force TEXT here so that we do not depend on their order. + ${hasCol ? `record_id::TEXT as record_id,` : ``} + ${hasSubCol ? `record_sub_id::TEXT as record_sub_id,` : ``} + * + ${hasCol ? `EXCLUDE ( + record_id + ${hasSubCol ? 
`, record_sub_id` : ``} + )` : ``} + FROM ${srcTableRef} +` +} +strictEqual( + _selectToBeMergedSql('gtfs', 'agency', { + record_id_column: 'agency_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }), + `\ + SELECT + 'agency' AS table_name, + -- Some UNION-ed tables have non-TEXT record_id/record_sub_id columns (e.g. INTEGER). + -- Given that UNION ALL does implicit casts to match the *first* table, we force TEXT here so that we do not depend on their order. + record_id::TEXT as record_id, + + * + EXCLUDE ( + record_id + + ) + FROM "gtfs"."_translations_agency" +`, + '_selectToBeMergedSql with agency.txt', +) +strictEqual( + _selectToBeMergedSql('gtfs', 'calendar_dates', { + record_id_column: 'service_id', + record_sub_id_column: 'date', record_sub_id_column_type: 'DATE', + }), + `\ + SELECT + 'calendar_dates' AS table_name, + -- Some UNION-ed tables have non-TEXT record_id/record_sub_id columns (e.g. INTEGER). + -- Given that UNION ALL does implicit casts to match the *first* table, we force TEXT here so that we do not depend on their order. + record_id::TEXT as record_id, + record_sub_id::TEXT as record_sub_id, + * + EXCLUDE ( + record_id + , record_sub_id + ) + FROM "gtfs"."_translations_calendar_dates" +`, + '_selectToBeMergedSql with calendar_dates.txt', ) -LEFT JOIN "${opt.schema}".translations route_u_t ON ( - route_u_t.table_name = 'routes' AND route_u_t.field_name = 'route_url' - AND (r.route_id = route_u_t.record_id OR r.route_long_name = route_u_t.field_value) -); -${opt.postgraphile ? `\ -COMMENT ON VIEW "${opt.schema}".routes_translated IS E'@omit'; -CREATE OR REPLACE FUNCTION "${opt.schema}".routes_translated_route_short_name ( - route routes, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, routes.route_short_name) - FROM "${opt.schema}".routes - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'routes' AND t.field_name = 'route_short_name' - AND (routes.route_id = t.record_id OR routes.route_short_name = t.field_value) - ) - WHERE routes.route_id = route.route_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -CREATE OR REPLACE FUNCTION "${opt.schema}".routes_translated_route_long_name ( - route routes, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, routes.route_long_name) - FROM "${opt.schema}".routes - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'routes' AND t.field_name = 'route_long_name' - AND (routes.route_id = t.record_id OR routes.route_long_name = t.field_value) - ) - WHERE routes.route_id = route.route_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -CREATE OR REPLACE FUNCTION "${opt.schema}".routes_translated_route_desc ( - route routes, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, routes.route_desc) - FROM "${opt.schema}".routes - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'routes' AND t.field_name = 'route_desc' - AND (routes.route_id = t.record_id OR routes.route_desc = t.field_value) - ) - WHERE routes.route_id = route.route_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -CREATE OR REPLACE FUNCTION "${opt.schema}".routes_translated_route_url ( - route 
routes, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, routes.route_url) - FROM "${opt.schema}".routes - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'routes' AND t.field_name = 'route_url' - AND (routes.route_id = t.record_id OR routes.route_url = t.field_value) +const _translatedSql = (schema, table_name, translatedCols) => { + const tableRef = `"${schema}"."${table_name}"` + const _transRefSql = (col) => `"trans_${col}"` + + const _sqls = Array.from(translatedCols.entries()) + .map(([col, translationRef]) => { + const { + src_table_name, + record_id_column, + record_sub_id_column, + } = translationRef + + const hasCol = record_id_column !== null + const colRef = hasCol ? `"${record_id_column}"` : null + const hasSubCol = record_sub_id_column !== null + const subColRef = hasSubCol ? `"${record_sub_id_column}"` : null + const srcTableRef = _srcTableRefSql(schema, src_table_name) + const transRef = _transRefSql(col) + + return { + colLangSelect: `\ + ${transRef}.language AS "${col}_lang",`, + colReplace: `\ + coalesce(${transRef}.translation, "${col}") AS "${col}"`, + transJoin: `\ +LEFT JOIN ${srcTableRef} ${transRef} ON ( + ${transRef}.field_name = '${col}' + ${hasCol ? `AND data.${colRef} = ${transRef}.record_id` : ``} + ${hasSubCol ? `AND data.${subColRef} = ${transRef}.record_sub_id` : ``} +)`, + } + }) + + return `\ +CREATE VIEW "${schema}"."${table_name}_translated" AS +SELECT + -- todo: fall back to feed_info.feed_lang? +${_sqls.map(sql => sql.colLangSelect).join('\n')} + data.* + REPLACE ( +${_sqls.map(sql => sql.colReplace).join(',\n')} ) - WHERE routes.route_id = route.route_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -` : ''} - --- todo [breaking]: remove in favor of trip_headsign_translations & trip_short_name_translations -CREATE OR REPLACE VIEW "${opt.schema}".trips_translated AS +FROM ${tableRef} data +${_sqls.map(sql => sql.transJoin).join('\n')}; +` +} +{ + const agencyRef = supportedTranslationRefs.get('agency') + strictEqual( + _translatedSql('gtfs', 'agency', new Map([ + ['agency_name', agencyRef], + ['agency_url', agencyRef], + ])), + `\ +CREATE VIEW "gtfs"."agency_translated" AS SELECT - -- almost all columns, duh - -- todo: find a way to use all columns without explicitly enumerating them here - trip_id, - route_id, - service_id, - coalesce(trip_h_t.translation, trip_headsign) as trip_headsign, - trip_h_t.language as trip_headsign_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(trip_s_t.translation, trip_short_name) as trip_short_name, - trip_s_t.language as trip_short_name_lang, -- todo: fall back to feed_info.feed_lang? - direction_id, - block_id, - shape_id, - wheelchair_accessible, - bikes_allowed -FROM "${opt.schema}".trips t -LEFT JOIN "${opt.schema}".translations trip_s_t ON ( - trip_s_t.table_name = 'trips' AND trip_s_t.field_name = 'trip_short_name' - AND (t.trip_id = trip_s_t.record_id OR t.trip_headsign = trip_s_t.field_value) + -- todo: fall back to feed_info.feed_lang? 
+ "trans_agency_name".language AS "agency_name_lang", + "trans_agency_url".language AS "agency_url_lang", + data.* + REPLACE ( + coalesce("trans_agency_name".translation, "agency_name") AS "agency_name", + coalesce("trans_agency_url".translation, "agency_url") AS "agency_url" + ) +FROM "gtfs"."agency" data +LEFT JOIN "gtfs"."_translations_agency" "trans_agency_name" ON ( + "trans_agency_name".field_name = 'agency_name' + AND data."agency_id" = "trans_agency_name".record_id + ) -LEFT JOIN "${opt.schema}".translations trip_h_t ON ( - trip_h_t.table_name = 'trips' AND trip_h_t.field_name = 'trip_headsign' - AND (t.trip_id = trip_h_t.record_id OR t.trip_headsign = trip_h_t.field_value) +LEFT JOIN "gtfs"."_translations_agency" "trans_agency_url" ON ( + "trans_agency_url".field_name = 'agency_url' + AND data."agency_id" = "trans_agency_url".record_id + ); -${opt.postgraphile ? `\ -COMMENT ON VIEW "${opt.schema}".trips_translated IS E'@omit'; - -CREATE OR REPLACE FUNCTION "${opt.schema}".trips_translated_trip_short_name ( - trip trips, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, trips.trip_short_name) - FROM "${opt.schema}".trips - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'trips' AND t.field_name = 'trip_short_name' - AND (trips.trip_id = t.record_id OR trips.trip_short_name = t.field_value) +`, + '_translatedSql with agency.txt', ) - WHERE trips.trip_id = trip.trip_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -CREATE OR REPLACE FUNCTION "${opt.schema}".trips_translated_trip_headsign ( - trip trips, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, trips.trip_headsign) - FROM "${opt.schema}".trips - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'trips' AND t.field_name = 'trip_headsign' - AND (trips.trip_id = t.record_id OR trips.trip_headsign = t.field_value) - ) - WHERE trips.trip_id = trip.trip_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -` : ''} - -CREATE OR REPLACE VIEW "${opt.schema}".arrivals_departures_translated AS +} +{ + const calendarDatesRef = supportedTranslationRefs.get('calendar_dates') + strictEqual( + _translatedSql('gtfs', 'calendar_dates', new Map([ + ['foo', calendarDatesRef], + ['b-a-r', calendarDatesRef], + ])), + `\ +CREATE VIEW "gtfs"."calendar_dates_translated" AS SELECT - -- almost all columns, duh - -- todo: find a way to use all columns without explicitly enumerating them here - route_id, - coalesce(route_s_t.translation, route_short_name) as route_short_name, - route_s_t.language as route_short_name_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(route_l_t.translation, route_long_name) as route_long_name, - route_l_t.language as route_long_name_lang, -- todo: fall back to feed_info.feed_lang? - route_type, - trip_id, direction_id, - coalesce(trip_t.translation, trip_headsign) as trip_headsign, - trip_t.language as trip_headsign_lang, -- todo: fall back to feed_info.feed_lang? - service_id, - shape_id, - "date", - stop_sequence, - coalesce(stop_times_t.translation, stop_headsign) as stop_headsign, - stop_times_t.language as stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? 
- pickup_type, drop_off_type, shape_dist_traveled, timepoint, - tz, - arrival_time, t_arrival, - departure_time, t_departure, - stop_id, - coalesce(stop_t.translation, stop_name) as stop_name, - stop_t.language as stop_name_lang, -- todo: fall back to feed_info.feed_lang? - station_id, - coalesce(station_t.translation, station_name) as station_name, - station_t.language as station_name_lang -- todo: fall back to feed_info.feed_lang? -FROM "${opt.schema}".arrivals_departures ad -LEFT JOIN "${opt.schema}".translations route_s_t ON ( - route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' - AND (ad.route_id = route_s_t.record_id OR ad.route_short_name = route_s_t.field_value) -) -LEFT JOIN "${opt.schema}".translations route_l_t ON ( - route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' - AND (ad.route_id = route_l_t.record_id OR ad.route_long_name = route_l_t.field_value) -) -LEFT JOIN "${opt.schema}".translations trip_t ON ( - trip_t.table_name = 'trips' AND trip_t.field_name = 'trip_headsign' - AND (ad.trip_id = trip_t.record_id OR ad.trip_headsign = trip_t.field_value) -) -LEFT JOIN "${opt.schema}".translations stop_t ON ( - stop_t.table_name = 'stops' AND stop_t.field_name = 'stop_name' - AND (ad.stop_id = stop_t.record_id OR ad.stop_name = stop_t.field_value) -) -LEFT JOIN "${opt.schema}".translations station_t ON ( - station_t.table_name = 'stops' AND station_t.field_name = 'stop_name' - AND station_t.language = stop_t.language - AND (ad.station_id = station_t.record_id OR ad.station_name = station_t.field_value) -) -LEFT JOIN "${opt.schema}".translations stop_times_t ON ( - stop_times_t.table_name = 'stop_times' AND stop_times_t.field_name = 'stop_headsign' - AND ( - (ad.trip_id = stop_times_t.record_id AND ad.stop_sequence = stop_times_t.record_sub_id::integer) - OR ad.stop_headsign = stop_times_t.field_value + -- todo: fall back to feed_info.feed_lang? + "trans_foo".language AS "foo_lang", + "trans_b-a-r".language AS "b-a-r_lang", + data.* + REPLACE ( + coalesce("trans_foo".translation, "foo") AS "foo", + coalesce("trans_b-a-r".translation, "b-a-r") AS "b-a-r" ) +FROM "gtfs"."calendar_dates" data +LEFT JOIN "gtfs"."_translations_calendar_dates" "trans_foo" ON ( + "trans_foo".field_name = 'foo' + AND data."service_id" = "trans_foo".record_id + AND data."date" = "trans_foo".record_sub_id +) +LEFT JOIN "gtfs"."_translations_calendar_dates" "trans_b-a-r" ON ( + "trans_b-a-r".field_name = 'b-a-r' + AND data."service_id" = "trans_b-a-r".record_id + AND data."date" = "trans_b-a-r".record_sub_id ); -${opt.postgraphile ? `\ -COMMENT ON VIEW "${opt.schema}".arrivals_departures_translated IS E'@omit'; -` : ''} - -CREATE OR REPLACE VIEW "${opt.schema}".connections_translated AS +`, + '_translatedSql with calendar_dates.txt', + ) +} +{ + const feedInfoRef = supportedTranslationRefs.get('feed_info') + strictEqual( + _translatedSql('gtfs', 'feed_info', new Map([ + ['foo', { + ...feedInfoRef, + src_table_name: 'some-other-table', + }], + ['b-a-r', feedInfoRef], + ])), + `\ +CREATE VIEW "gtfs"."feed_info_translated" AS SELECT - -- almost all columns, duh - -- todo: find a way to use all columns without explicitly enumerating them here - route_id, - coalesce(route_s_t.translation, route_short_name) as route_short_name, - route_s_t.language as route_short_name_lang, -- todo: fall back to feed_info.feed_lang? 
- coalesce(route_l_t.translation, route_long_name) as route_long_name, - route_l_t.language as route_long_name_lang, -- todo: fall back to feed_info.feed_lang? - route_type, - trip_id, - service_id, - direction_id, - coalesce(trip_t.translation, trip_headsign) as trip_headsign, - trip_t.language as trip_headsign_lang, -- todo: fall back to feed_info.feed_lang? - - from_stop_id, - coalesce(from_stop.translation, from_stop_name) as from_stop_name, - from_stop.language as from_stop_name_lang, -- todo: fall back to feed_info.feed_lang? - from_station_id, - coalesce(from_station.translation, from_station_name) as from_station_name, - from_station.language as from_station_name_lang, -- todo: fall back to feed_info.feed_lang? - - coalesce(from_stop_times_t.translation, from_stop_headsign) as from_stop_headsign, - from_stop_times_t.language as from_stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? - from_pickup_type, - t_departure, - departure_time, -- todo [breaking]: this is misleading, remove it - from_stop_sequence, - from_timepoint, - - "date", - - to_timepoint, - to_stop_sequence, - t_arrival, - arrival_time, -- todo [breaking]: this is misleading, remove it - to_drop_off_type, - coalesce(to_stop_times_t.translation, to_stop_headsign) as to_stop_headsign, - to_stop_times_t.language as to_stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? - - to_stop_id, - coalesce(to_stop.translation, to_stop_name) as to_stop_name, - to_stop.language as to_stop_name_lang, -- todo: fall back to feed_info.feed_lang? - to_station_id, - coalesce(to_station.translation, to_station_name) as to_station_name, - to_station.language as to_station_name_lang -- todo: fall back to feed_info.feed_lang? -FROM "${opt.schema}".connections c -LEFT JOIN "${opt.schema}".translations route_s_t ON ( - route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' - AND (c.route_id = route_s_t.record_id OR c.route_short_name = route_s_t.field_value) -) -LEFT JOIN "${opt.schema}".translations route_l_t ON ( - route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' - AND (c.route_id = route_l_t.record_id OR c.route_long_name = route_l_t.field_value) -) -LEFT JOIN "${opt.schema}".translations trip_t ON ( - trip_t.table_name = 'trips' AND trip_t.field_name = 'trip_headsign' - AND (c.trip_id = trip_t.record_id OR c.trip_headsign = trip_t.field_value) -) -LEFT JOIN "${opt.schema}".translations from_stop ON ( - from_stop.table_name = 'stops' AND from_stop.field_name = 'stop_name' - AND (c.from_stop_id = from_stop.record_id OR c.from_stop_name = from_stop.field_value) -) -LEFT JOIN "${opt.schema}".translations from_station ON ( - from_station.table_name = 'stops' AND from_station.field_name = 'stop_name' - AND from_station.language = from_stop.language - AND (c.from_station_id = from_station.record_id OR c.from_station_name = from_station.field_value) -) -LEFT JOIN "${opt.schema}".translations to_stop ON ( - to_stop.table_name = 'stops' AND to_stop.field_name = 'stop_name' - AND to_stop.language = from_stop.language - AND (c.to_stop_id = to_stop.record_id OR c.to_stop_name = to_stop.field_value) -) -LEFT JOIN "${opt.schema}".translations to_station ON ( - to_station.table_name = 'stops' AND to_station.field_name = 'stop_name' - AND to_station.language = from_stop.language - AND (c.to_station_id = to_station.record_id OR c.to_station_name = to_station.field_value) -) -LEFT JOIN "${opt.schema}".translations from_stop_times_t ON ( - from_stop_times_t.table_name = 'stop_times' AND 
from_stop_times_t.field_name = 'stop_headsign' - AND ( - (c.trip_id = from_stop_times_t.record_id AND c.from_stop_sequence = from_stop_times_t.record_sub_id::integer) - OR c.from_stop_headsign = from_stop_times_t.field_value + -- todo: fall back to feed_info.feed_lang? + "trans_foo".language AS "foo_lang", + "trans_b-a-r".language AS "b-a-r_lang", + data.* + REPLACE ( + coalesce("trans_foo".translation, "foo") AS "foo", + coalesce("trans_b-a-r".translation, "b-a-r") AS "b-a-r" ) +FROM "gtfs"."feed_info" data +LEFT JOIN "gtfs"."_translations_some-other-table" "trans_foo" ON ( + "trans_foo".field_name = 'foo' + + ) -LEFT JOIN "${opt.schema}".translations to_stop_times_t ON ( - to_stop_times_t.table_name = 'stop_times' AND to_stop_times_t.field_name = 'stop_headsign' - AND ( - (c.trip_id = to_stop_times_t.record_id AND c.to_stop_sequence = to_stop_times_t.record_sub_id::integer) - OR c.to_stop_headsign = to_stop_times_t.field_value - ) +LEFT JOIN "gtfs"."_translations_feed_info" "trans_b-a-r" ON ( + "trans_b-a-r".field_name = 'b-a-r' + + ); -${opt.postgraphile ? `\ -COMMENT ON VIEW "${opt.schema}".connections_translated IS E'@omit'; -` : ''} -` +`, + '_translatedSql with feed_info.txt', + ) +} -module.exports = { - beforeAll, - formatRow: formatTranslationsRow, - afterAll, +// https://gtfs.org/schedule/reference/#translationstxt +const importData = async (db, pathToTranslations, opt, workingState) => { + const selectsToBeMerged = [] + for (const [table_name, translationRef] of supportedTranslationRefs.entries()) { + await db[RUN](_srcTablesSql(pathToTranslations, opt.schema, table_name, translationRef)) + selectsToBeMerged.push(_selectToBeMergedSql(opt.schema, table_name, translationRef)) + } + + await db[RUN](`\ +-- The MobilityData GTFS Validator just uses Java's Locale#toLanguageTag() to validate "language". +-- https://github.com/MobilityData/gtfs-validator/blob/a11b7489902dd54dc194af1f1515583406ba3716/main/src/main/java/org/mobilitydata/gtfsvalidator/table/GtfsTranslationSchema.java#L36 +-- https://docs.oracle.com/javase/7/docs/api/java/util/Locale.html +-- related: https://github.com/google/transit/pull/98 + +-- We mimick a true table with a view that UNIONs all individual _translations_* tables. 
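+-- Illustration (shape only, remaining columns shortened to …): the generated statement looks roughly like
+--   CREATE VIEW "<schema>".translations AS
+--   (SELECT 'agency' AS table_name, record_id::TEXT AS record_id, … FROM "<schema>"."_translations_agency")
+--   UNION ALL BY NAME
+--   (SELECT 'calendar_dates' AS table_name, record_id::TEXT AS record_id, record_sub_id::TEXT AS record_sub_id, … FROM "<schema>"."_translations_calendar_dates")
+--   UNION ALL BY NAME
+--   …
+-- BY NAME matches columns by name and fills NULL where a per-table SELECT lacks a column
+-- (e.g. record_sub_id for agency).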
+CREATE VIEW "${opt.schema}".translations AS +${selectsToBeMerged.map(sql => `(${sql})`).join(`UNION ALL BY NAME`)}; +`) + + const agencyRef = supportedTranslationRefs.get('agency') + const stopsRef = supportedTranslationRefs.get('stops') + const routesRef = supportedTranslationRefs.get('routes') + const tripsRef = supportedTranslationRefs.get('trips') + const stopTimesRef = supportedTranslationRefs.get('stop_times') + const pathwaysRef = supportedTranslationRefs.get('pathways') + const levelsRef = supportedTranslationRefs.get('levels') + const feedInfoRef = supportedTranslationRefs.get('feed_info') + const preTranslatedColumns = new Map([ + ['agency', new Map([ + ['agency_name', agencyRef], + ['agency_url', agencyRef], + ['agency_phone', agencyRef], + ['agency_fare_url', agencyRef], + ['agency_email', agencyRef], + ])], + ['stops', new Map([ + ['stop_code', stopsRef], + ['stop_name', stopsRef], + // todo: not supported yet by stops.js + // ['tts_stop_name', stopsRef], + ['stop_desc', stopsRef], + ['stop_url', stopsRef], + ['platform_code', stopsRef], + ])], + ['routes', new Map([ + ['route_short_name', routesRef], + ['route_long_name', routesRef], + ['route_desc', routesRef], + ['route_url', routesRef], + ])], + ['trips', new Map([ + ['trip_headsign', tripsRef], + ['trip_short_name', tripsRef], + // todo: not supported yet by trips.js + // ['trip_desc', tripsRef], + // ['trip_url', tripsRef], + ])], + ['stop_times', new Map([ + ['stop_headsign', stopTimesRef], + ])], + // todo: fare_attributes.txt & fare_rules.txt are not supported yet + // todo: frequencies.txt (see above) + // todo: areas.txt is not supported yet + // todo: networks.txt is not supported yet + ['pathways', new Map([ + ['signposted_as', pathwaysRef], + ['reversed_signposted_as', pathwaysRef], + ])], + ['levels', new Map([ + ['level_name', levelsRef], + ])], + // todo: location_groups.txt is not supported yet + // todo: booking_rules.txt is not supported yet + ['feed_info', new Map([ + ['feed_publisher_name', feedInfoRef], + ['feed_publisher_url', feedInfoRef], + ['feed_version', feedInfoRef], + ['feed_contact_email', feedInfoRef], + ['feed_contact_url', feedInfoRef], + ])], + // todo: attribution.txt is not supported yet + + ]) + for (const [table_name, translatedCols] of preTranslatedColumns) { + await db[RUN](_translatedSql(opt.schema, table_name, translatedCols)) + } + + // *_translated for tables/views made up by gtfs-via-postgres + { + await db[RUN](_translatedSql(opt.schema, 'arrivals_departures', new Map([ + ['route_short_name', routesRef], + ['route_long_name', routesRef], + ['trip_headsign', tripsRef], + ['stop_headsign', stopsRef], + ['stop_name', stopsRef], + // todo: ['station_name', stopsRef], + ]))) + } + // todo: connections + +// `\ +// -- CREATE OR REPLACE VIEW "${opt.schema}".arrivals_departures_translated AS +// -- SELECT +// -- -- almost all columns, duh +// -- -- todo: find a way to use all columns without explicitly enumerating them here +// -- route_id, +// -- coalesce(route_s_t.translation, route_short_name) as route_short_name, +// -- route_s_t.language as route_short_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- coalesce(route_l_t.translation, route_long_name) as route_long_name, +// -- route_l_t.language as route_long_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- route_type, +// -- trip_id, direction_id, +// -- coalesce(trip_t.translation, trip_headsign) as trip_headsign, +// -- trip_t.language as trip_headsign_lang, -- todo: fall back to feed_info.feed_lang? 
+// -- service_id, +// -- shape_id, +// -- "date", +// -- stop_sequence, +// -- coalesce(stop_times_t.translation, stop_headsign) as stop_headsign, +// -- stop_times_t.language as stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? +// -- pickup_type, drop_off_type, shape_dist_traveled, timepoint, +// -- tz, +// -- arrival_time, t_arrival, +// -- departure_time, t_departure, +// -- stop_id, +// -- coalesce(stop_t.translation, stop_name) as stop_name, +// -- stop_t.language as stop_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- station_id, +// -- coalesce(station_t.translation, station_name) as station_name, +// -- station_t.language as station_name_lang -- todo: fall back to feed_info.feed_lang? +// -- FROM "${opt.schema}".arrivals_departures ad +// -- LEFT JOIN "${opt.schema}".translations route_s_t ON ( +// -- route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' +// -- AND (ad.route_id = route_s_t.record_id OR ad.route_short_name = route_s_t.field_value) +// -- ) +// -- LEFT JOIN "${opt.schema}".translations route_l_t ON ( +// -- route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' +// -- AND (ad.route_id = route_l_t.record_id OR ad.route_long_name = route_l_t.field_value) +// -- ) +// -- LEFT JOIN "${opt.schema}".translations trip_t ON ( +// -- trip_t.table_name = 'trips' AND trip_t.field_name = 'trip_headsign' +// -- AND (ad.trip_id = trip_t.record_id OR ad.trip_headsign = trip_t.field_value) +// -- ) +// -- LEFT JOIN "${opt.schema}".translations stop_t ON ( +// -- stop_t.table_name = 'stops' AND stop_t.field_name = 'stop_name' +// -- AND (ad.stop_id = stop_t.record_id OR ad.stop_name = stop_t.field_value) +// -- ) +// -- LEFT JOIN "${opt.schema}".translations station_t ON ( +// -- station_t.table_name = 'stops' AND station_t.field_name = 'stop_name' +// -- AND station_t.language = stop_t.language +// -- AND (ad.station_id = station_t.record_id OR ad.station_name = station_t.field_value) +// -- ) +// -- LEFT JOIN "${opt.schema}".translations stop_times_t ON ( +// -- stop_times_t.table_name = 'stop_times' AND stop_times_t.field_name = 'stop_headsign' +// -- AND ( +// -- (ad.trip_id = stop_times_t.record_id AND ad.stop_sequence = stop_times_t.record_sub_id::integer) +// -- OR ad.stop_headsign = stop_times_t.field_value +// -- ) +// -- ); +// -- +// -- CREATE OR REPLACE VIEW "${opt.schema}".connections_translated AS +// -- SELECT +// -- -- almost all columns, duh +// -- -- todo: find a way to use all columns without explicitly enumerating them here +// -- route_id, +// -- coalesce(route_s_t.translation, route_short_name) as route_short_name, +// -- route_s_t.language as route_short_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- coalesce(route_l_t.translation, route_long_name) as route_long_name, +// -- route_l_t.language as route_long_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- route_type, +// -- trip_id, +// -- service_id, +// -- direction_id, +// -- coalesce(trip_t.translation, trip_headsign) as trip_headsign, +// -- trip_t.language as trip_headsign_lang, -- todo: fall back to feed_info.feed_lang? +// -- +// -- from_stop_id, +// -- coalesce(from_stop.translation, from_stop_name) as from_stop_name, +// -- from_stop.language as from_stop_name_lang, -- todo: fall back to feed_info.feed_lang? 
+// -- from_station_id, +// -- coalesce(from_station.translation, from_station_name) as from_station_name, +// -- from_station.language as from_station_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- +// -- coalesce(from_stop_times_t.translation, from_stop_headsign) as from_stop_headsign, +// -- from_stop_times_t.language as from_stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? +// -- from_pickup_type, +// -- t_departure, +// -- departure_time, -- todo [breaking]: this is misleading, remove it +// -- from_stop_sequence, +// -- from_timepoint, +// -- +// -- "date", +// -- +// -- to_timepoint, +// -- to_stop_sequence, +// -- t_arrival, +// -- arrival_time, -- todo [breaking]: this is misleading, remove it +// -- to_drop_off_type, +// -- coalesce(to_stop_times_t.translation, to_stop_headsign) as to_stop_headsign, +// -- to_stop_times_t.language as to_stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? +// -- +// -- to_stop_id, +// -- coalesce(to_stop.translation, to_stop_name) as to_stop_name, +// -- to_stop.language as to_stop_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- to_station_id, +// -- coalesce(to_station.translation, to_station_name) as to_station_name, +// -- to_station.language as to_station_name_lang -- todo: fall back to feed_info.feed_lang? +// -- FROM "${opt.schema}".connections c +// -- LEFT JOIN "${opt.schema}".translations route_s_t ON ( +// -- route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' +// -- AND (c.route_id = route_s_t.record_id OR c.route_short_name = route_s_t.field_value) +// -- ) +// -- LEFT JOIN "${opt.schema}".translations route_l_t ON ( +// -- route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' +// -- AND (c.route_id = route_l_t.record_id OR c.route_long_name = route_l_t.field_value) +// -- ) +// -- LEFT JOIN "${opt.schema}".translations trip_t ON ( +// -- trip_t.table_name = 'trips' AND trip_t.field_name = 'trip_headsign' +// -- AND (c.trip_id = trip_t.record_id OR c.trip_headsign = trip_t.field_value) +// -- ) +// -- LEFT JOIN "${opt.schema}".translations from_stop ON ( +// -- from_stop.table_name = 'stops' AND from_stop.field_name = 'stop_name' +// -- AND (c.from_stop_id = from_stop.record_id OR c.from_stop_name = from_stop.field_value) +// -- ) +// -- LEFT JOIN "${opt.schema}".translations from_station ON ( +// -- from_station.table_name = 'stops' AND from_station.field_name = 'stop_name' +// -- AND from_station.language = from_stop.language +// -- AND (c.from_station_id = from_station.record_id OR c.from_station_name = from_station.field_value) +// -- ) +// -- LEFT JOIN "${opt.schema}".translations to_stop ON ( +// -- to_stop.table_name = 'stops' AND to_stop.field_name = 'stop_name' +// -- AND to_stop.language = from_stop.language +// -- AND (c.to_stop_id = to_stop.record_id OR c.to_stop_name = to_stop.field_value) +// -- ) +// -- LEFT JOIN "${opt.schema}".translations to_station ON ( +// -- to_station.table_name = 'stops' AND to_station.field_name = 'stop_name' +// -- AND to_station.language = from_stop.language +// -- AND (c.to_station_id = to_station.record_id OR c.to_station_name = to_station.field_value) +// -- ) +// -- LEFT JOIN "${opt.schema}".translations from_stop_times_t ON ( +// -- from_stop_times_t.table_name = 'stop_times' AND from_stop_times_t.field_name = 'stop_headsign' +// -- AND ( +// -- (c.trip_id = from_stop_times_t.record_id AND c.from_stop_sequence = from_stop_times_t.record_sub_id::integer) +// -- OR c.from_stop_headsign = 
from_stop_times_t.field_value +// -- ) +// -- ) +// -- LEFT JOIN "${opt.schema}".translations to_stop_times_t ON ( +// -- to_stop_times_t.table_name = 'stop_times' AND to_stop_times_t.field_name = 'stop_headsign' +// -- AND ( +// -- (c.trip_id = to_stop_times_t.record_id AND c.to_stop_sequence = to_stop_times_t.record_sub_id::integer) +// -- OR c.to_stop_headsign = to_stop_times_t.field_value +// -- ) +// -- ); +// `; + + workingState.nrOfRowsByName.set('translations', await queryNumberOfRows(db, 'translations', opt)) } + +module.exports = importData diff --git a/lib/trips.js b/lib/trips.js index 9b8beb0..b27efe6 100644 --- a/lib/trips.js +++ b/lib/trips.js @@ -1,7 +1,11 @@ 'use strict' +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/schedule/reference/#tripstxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToTrips, opt, workingState) => { + await db[RUN](`\ CREATE TYPE "${opt.schema}".wheelchair_accessibility AS ENUM ( 'unknown' -- 0 or empty - No accessibility information for the trip. , 'accessible' -- 1 – Vehicle being used on this particular trip can accommodate at least one rider in a wheelchair. @@ -20,75 +24,42 @@ CREATE TABLE "${opt.schema}".trips ( trip_id TEXT PRIMARY KEY, route_id TEXT NOT NULL, FOREIGN KEY (route_id) REFERENCES "${opt.schema}".routes, + -- todo: add foreign key constraint? service_id TEXT NOT NULL, -- references "${opt.schema}".service_days.service_id trip_headsign TEXT, trip_short_name TEXT, direction_id INT, block_id TEXT, shape_id TEXT, -- todo: add NOT NULL? - ${opt.tripsWithoutShapeId ? '' : `CONSTRAINT valid_shape_id CHECK ("${opt.schema}".shape_exists(shape_id)),`} - -- todo [breaking]: use 0/unknown for empty values + -- ${opt.tripsWithoutShapeId ? '' : `FOREIGN KEY (shape_id) REFERENCES "${opt.schema}".shapes,`} wheelchair_accessible "${opt.schema}".wheelchair_accessibility, -- todo [breaking]: use 0/unknown for empty values bikes_allowed "${opt.schema}".bikes_allowance ); -COPY "${opt.schema}".trips ( - trip_id, - route_id, - service_id, - trip_headsign, - trip_short_name, - direction_id, - block_id, - shape_id, - wheelchair_accessible, - bikes_allowed -) FROM STDIN csv; -` - -const wheelchairAccessibility = (val) => { - if (val === '0') return 'unknown' - if (val === '1') return 'accessible' - if (val === '2') return 'not_accessible' - throw new Error('invalid wheelchair_accessibility: ' + val) -} - -const bikesAllowance = (val) => { - if (val === '0') return 'unknown' - if (val === '1') return 'allowed' - if (val === '2') return 'not_allowed' - throw new Error('invalid bikes_allowance: ' + val) -} +INSERT INTO "${opt.schema}".trips +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * REPLACE ( + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. 
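+	-- For example, enum_range(NULL::wheelchair_accessibility) yields ['unknown', 'accessible', 'not_accessible'],
+	-- so a GTFS wheelchair_accessible value of 1 picks element [1 + 1] = 'accessible'.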
+ enum_range(NULL::wheelchair_accessibility)[wheelchair_accessible + 1] AS wheelchair_accessible, + enum_range(NULL::bikes_allowance)[bikes_allowed + 1] AS bikes_allowed +) +FROM read_csv( + '${pathToTrips}', + header = true, + all_varchar = true, + types = { + direction_id: 'INTEGER', + wheelchair_accessible: 'INTEGER', + bikes_allowed: 'INTEGER', + } +); +`) -const formatTripsRow = (t) => { - return [ - t.trip_id || null, - t.route_id || null, - t.service_id || null, - t.trip_headsign || null, - t.trip_short_name || null, - t.direction_id ? parseInt(t.direction_id) : null, - t.block_id || null, - t.shape_id || null, - t.wheelchair_accessible - ? wheelchairAccessibility(t.wheelchair_accessible) - : null, - t.bikes_allowed ? bikesAllowance(t.bikes_allowed) : null, - ] + workingState.nrOfRowsByName.set('trips', await queryNumberOfRows(db, 'trips', opt)) } -const afterAll = (opt) => `\ -\\. - -${opt.postgraphile ? `\ -CREATE INDEX ON "${opt.schema}".trips (route_id); -COMMENT ON TABLE "${opt.schema}".trips IS E'@foreignKey (shape_id) references shapes_aggregated|@fieldName shape'; -` : ''} -` - -module.exports = { - beforeAll, - formatRow: formatTripsRow, - afterAll, -} +module.exports = importData diff --git a/lib/util.js b/lib/util.js deleted file mode 100644 index affe20b..0000000 --- a/lib/util.js +++ /dev/null @@ -1,12 +0,0 @@ -'use strict' - -const parseTime = require('gtfs-utils/parse-time') - -const formatTime = (gtfsTime) => { - const {hours: h, minutes: m, seconds: s} = parseTime(gtfsTime) - return `${h} hours ${m} minutes ${s === null ? 0 : s} seconds` -} - -module.exports = { - formatTime, -} diff --git a/package.json b/package.json index 905e733..6b9b9de 100644 --- a/package.json +++ b/package.json @@ -1,4 +1,5 @@ { + "private": true, "name": "gtfs-via-postgres", "description": "Process GTFS using PostgreSQL.", "version": "4.10.2", @@ -55,25 +56,17 @@ "node": ">=16.17" }, "dependencies": { - "csv-stringify": "^6.2.0", "debug": "^4.3.3", + "duckdb": "^1.0.0", "gtfs-utils": "^5.1.0", "sequencify": "0.0.7" }, "devDependencies": { - "@graphile-contrib/pg-simplify-inflector": "^6.1.0", - "@graphile/postgis": "^0.2.0-0", "csv-parser": "^3.0.0", "eslint": "^8.33.0", "pkg": "^5.3.2", - "postgraphile": "^4.12.11", "sample-gtfs-feed": "^0.13.0" }, - "peerDependencies": { - "@graphile-contrib/pg-simplify-inflector": "^6.1.0", - "@graphile/postgis": "^0.2.0-0", - "postgraphile": "^4.12.11" - }, "scripts": { "test": "./test/index.sh", "lint": "eslint .", diff --git a/readme.md b/readme.md index 44c87b2..5fb62b3 100644 --- a/readme.md +++ b/readme.md @@ -59,6 +59,10 @@ ls -lh gtfs # 16M trips.csv ``` +```shell +duckdb_cli -c 'INSTALL ICU' +``` + Depending on your specific setup, configure access to the PostgreSQL database via [`PG*` environment variables](https://www.postgresql.org/docs/14/libpq-envars.html): ```sh diff --git a/scripts/run-postgraphile.js b/scripts/run-postgraphile.js deleted file mode 100755 index bf74ee8..0000000 --- a/scripts/run-postgraphile.js +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env node - -const {createServer} = require('http') -const {postgraphile} = require('postgraphile') -const postgisPlugin = require('@graphile/postgis').default -const simplifyInflectorPlugin = require('@graphile-contrib/pg-simplify-inflector') - -const DEV = process.env.NODE_ENV === 'development' -const PROD = !DEV -const PORT = process.env.PORT ? 
parseInt(process.env.PORT) : 3000 -const SCHEMA = process.env.PGSCHEMA || 'public' - -const pg = postgraphile({}, SCHEMA, { - appendPlugins: [ - // PostGIS support for PostGraphile - postgisPlugin, - - // Simplifies the graphile-build-pg inflector to trim the `ByFooIdAndBarId` from relations - simplifyInflectorPlugin, - ], - graphileBuildOptions: { - pgSimplifyAllRows: false, - pgShortPk: false, - }, - - pgSettings: async () => ({ - // With `timestamptz` (a.k.a. `timestamp with time zone`), PostgreSQL *doesn't* store the timezone (offset) specified on input; Instead, it always converts to UTC. - // When querying a `timestamptz` value, it converts to the local timezone (offset) of the client's session or database server. - // Because we loose the timezone offset information *anyways*, we configure PostGraphile to give predictable results by letting PostgreSQL always convert to UTC. - timezone: 'UTC', - }), - - // [Experimental] Determines if the 'Explain' feature in GraphiQL can be used to show the user the SQL statements that were executed. Set to a boolean to enable all users to use this, or to a function that filters each request to determine if the request may use Explain. DO NOT USE IN PRODUCTION unless you're comfortable with the security repurcussions of doing so. - allowExplain: DEV, - - // Enables classic ids for Relay support. Instead of using the field name nodeId for globally unique ids, PostGraphile will instead use the field name id for its globally unique ids. This means that table id columns will also get renamed to rowId. - classicIds: true, - - // Turns off GraphQL query logging. By default PostGraphile will log every GraphQL query it processes along with some other information. Set this to true (recommended in production) to disable that feature. - disableQueryLog: PROD, - - // By default, JSON and JSONB fields are presented as strings (JSON encoded) from the GraphQL schema. Setting this to true (recommended) enables raw JSON input and output, saving the need to parse / stringify JSON manually. - dynamicJson: true, - - // Set this to true to add some enhancements to GraphiQL; intended for development usage only (automatically enables with subscriptions and live). - enhanceGraphiql: DEV, - - // Set this to true to enable the GraphiQL interface. - graphiql: true, - - // Extends the error response with additional details from the Postgres error. Can be any combination of ['hint', 'detail', 'errcode']. Default is []. - extendedErrors: DEV ? ['hint', 'detail', 'errcode'] : [], - - // Set false to exclude filters, orderBy, and relations that would be expensive to access due to missing indexes. Changing this from true to false is a breaking change, but false to true is not. The default is true. - ignoreIndexes: false, - - // Set false (recommended) to exclude fields, queries and mutations that are not available to any possible user (determined from the user in connection string and any role they can become); set this option true to skip these checks and create GraphQL fields and types for everything. The default is true, in v5 the default will change to false. - ignoreRBAC: false, - - // Some one-to-one relations were previously detected as one-to-many - should we export 'only' the old relation shapes, both new and old but mark the old ones as 'deprecated' (default), or 'omit' (recommended) the old relation shapes entirely. 
- legacyRelations: 'omit', - - // If none of your RETURNS SETOF compound_type functions mix NULLs with the results then you may set this false to reduce the nullables in the GraphQL schema. - setofFunctionsContainNulls: false, - - // Enables adding a stack field to the error response. Can be either the boolean true (which results in a single stack string) or the string json (which causes the stack to become an array with elements for each line of the stack). Recommended in development, not recommended in production. - showErrorStack: DEV, - - // Should we use relay pagination, or simple collections? - // "omit" (default) - // relay connections only, "only" (not recommended) - // simple collections only (no Relay connections), "both" - both. - simpleCollections: 'omit', -}) - -const server = createServer(pg) -server.listen(PORT, (err) => { - if (err) { - console.error(err) - process.exit(1) - } - const {port} = server.address() - console.info(`PostGraphile listening on port ${port}`) -}) diff --git a/test/amtrak-gtfs-2021-10-06.sh b/test/amtrak-gtfs-2021-10-06.sh index 0adcfc7..3a55a9c 100755 --- a/test/amtrak-gtfs-2021-10-06.sh +++ b/test/amtrak-gtfs-2021-10-06.sh @@ -11,8 +11,7 @@ env | grep '^PG' || true unzip -q -j -n amtrak-gtfs-2021-10-06.zip -d amtrak-gtfs-2021-10-06 ls -lh amtrak-gtfs-2021-10-06 -psql -c 'create database amtrak_2021_10_06' -export PGDATABASE='amtrak_2021_10_06' +path_to_db="$(mktemp -d -t gtfs)/amtrak-gtfs-2021-10-06.duckdb" ../cli.js -d --trips-without-shape-id --schema amtrak \ --import-metadata \ @@ -20,8 +19,8 @@ export PGDATABASE='amtrak_2021_10_06' --stats-by-agency-route-stop-hour=view \ --stats-active-trips-by-hour=view \ --postgrest \ - -- amtrak-gtfs-2021-10-06/*.txt \ - | sponge | psql -b + "$path_to_db" \ + -- amtrak-gtfs-2021-10-06/*.txt query=$(cat << EOF select extract(epoch from t_arrival)::integer as t_arrival @@ -33,26 +32,26 @@ EOF ) # 2021-11-26T15:15:00-05:00 -arr1=$(psql --csv -t -c "$query" | head -n 1) +arr1=$(duckdb -csv -noheader -c "$query" | head -n 1) if [[ "$arr1" != "1637957700" ]]; then echo "invalid 1st t_arrival: $arr1" 1>&2 exit 1 fi # 2021-11-27T13:45:00-05:00 -arrN=$(psql --csv -t -c "$query" | tail -n 1) +arrN=$(duckdb -csv -noheader -c "$query" | tail -n 1) if [[ "$arrN" != "1638038700" ]]; then echo "invalid 2nd t_arrival: $arrN" 1>&2 exit 1 fi -version=$(psql --csv -t -c "SELECT split_part(amtrak.gtfs_via_postgres_version(), '.', 1)" | tail -n 1) +version=$(duckdb -csv -noheader -c "SELECT split_part(amtrak.gtfs_via_postgres_version(), '.', 1)" | tail -n 1) if [[ "$version" != "4" ]]; then echo "invalid gtfs_via_postgres_version(): $version" 1>&2 exit 1 fi -fMin=$(psql --csv -t -c "SELECT amtrak.dates_filter_min('2021-11-27T13:45:00-06')" | tail -n 1) +fMin=$(duckdb -csv -noheader -c "SELECT amtrak.dates_filter_min('2021-11-27T13:45:00-06')" | tail -n 1) if [[ "$fMin" != "2021-11-24" ]]; then echo "invalid dates_filter_min(…): $fMin" 1>&2 exit 1 @@ -66,7 +65,7 @@ AND date = '2021-11-26' AND is_effective = True EOF ) -acelaStat=$(psql --csv -t -c "$acelaStatQuery" | tail -n 1) +acelaStat=$(duckdb -csv -noheader -c "$acelaStatQuery" | tail -n 1) if [[ "$acelaStat" != "16,190" ]]; then echo "invalid stats for route 40751 (Acela) on 2021-11-26: $acelaStat" 1>&2 exit 1 @@ -80,7 +79,7 @@ AND stop_id = 'PHL' -- Philadelphia AND effective_hour = '2022-07-24T09:00-05' EOF ) -acelaPhillyStat=$(psql --csv -t -c "$acelaPhillyStatQuery" | tail -n 1) +acelaPhillyStat=$(duckdb -csv -noheader -c "$acelaPhillyStatQuery" | tail -n 1) if [[ 
"$acelaPhillyStat" != "2" ]]; then echo "invalid stats for route 40751 (Acela) at PHL (Philadelphia) on 2021-11-26: $acelaPhillyStat" 1>&2 exit 1 @@ -98,44 +97,10 @@ EOF # FROM amtrak.connections # WHERE t_departure >= '2021-11-26T02:00-05' # AND t_arrival <= '2021-11-26T06:00-05' -nrOfActiveTrips=$(psql --csv -t -c "$nrOfActiveTripsQuery" | tail -n 1) +nrOfActiveTrips=$(duckdb -csv -noheader -c "$nrOfActiveTripsQuery" | tail -n 1) if [[ "$nrOfActiveTrips" != "127" ]]; then echo "unexpected no. of active trips at 2021-11-26T04:00-05: $nrOfActiveTrips" 1>&2 exit 1 fi -# kill child processes on exit -# https://stackoverflow.com/questions/360201/how-do-i-kill-background-processes-jobs-when-my-shell-script-exits/2173421#2173421 -trap 'exit_code=$?; kill -- $(jobs -p); exit $exit_code' SIGINT SIGTERM EXIT - -env \ - PGRST_DB_SCHEMAS=amtrak \ - PGRST_DB_ANON_ROLE=web_anon \ - PGRST_ADMIN_SERVER_PORT=3001 \ - PGRST_LOG_LEVEL=info \ - postgrest & - # docker run --rm -i \ - # -p 3000:3000 -p 3001:3001 \ - # -e PGHOST=host.docker.internal -e PGUSER -e PGPASSWORD -e PGDATABASE \ - # postgrest/postgrest & -sleep 3 - -health_status="$(curl 'http://localhost:3001/live' -I -fsS | grep -o -m1 -E '[0-9]{3}')" -if [ "$health_status" != '200' ]; then - 1>&2 echo "/live: expected 200, got $health_status" - exit 1 -fi - -stops_url='http://localhost:3000/stops?stop_name=ilike.%25palm%25&limit=1&order=stop_id.asc' -stops_status="$(curl "$stops_url" -H 'Accept: application/json' -I -fsS | grep -o -m1 -E '[0-9]{3}')" -if [ "$stops_status" != '200' ]; then - 1>&2 echo "$stops_url: expected 200, got $stops_status" - exit 1 -fi -stop_id="$(curl "$stops_url" -H 'Accept: application/json' -fsS | jq -rc '.[0].stop_id')" -if [ "$stop_id" != 'PDC' ]; then - 1>&2 echo "$stops_url: expected PDC, got $stop_id" - exit 1 -fi - echo 'works ✔' diff --git a/test/calendar-dates-only.sh b/test/calendar-dates-only.sh index 1014ad1..184be55 100755 --- a/test/calendar-dates-only.sh +++ b/test/calendar-dates-only.sh @@ -8,12 +8,11 @@ set -x env | grep '^PG' || true -psql -c 'create database calendar_dates_only' -export PGDATABASE='calendar_dates_only' +path_to_db="$(mktemp -d -t gtfs)/calendar-dates-only.duckdb" ../cli.js -d --trips-without-shape-id -- \ - calendar-dates-only/*.txt \ - | sponge | psql -b + "$path_to_db" \ + calendar-dates-only/*.txt query=$(cat << EOF select extract(epoch from t_arrival)::integer as t_arrival @@ -24,14 +23,14 @@ EOF ) # 2019-07-15T15:30:00+02:00 -arr1=$(psql --csv -t -c "$query" | head -n 1) +arr1=$(duckdb -csv -noheader -c "$query" | head -n 1) if [[ "$arr1" != "1563197400" ]]; then echo "invalid 1st t_arrival: $arr1" 1>&2 exit 1 fi # 2019-07-20T15:30:00+02:00 -arrN=$(psql --csv -t -c "$query" | tail -n 1) +arrN=$(duckdb -csv -noheader -c "$query" | tail -n 1) if [[ "$arrN" != "1563629400" ]]; then echo "invalid 2nd t_arrival: $arrN" 1>&2 exit 1 @@ -43,7 +42,7 @@ from arrivals_departures where agency_id IS NULL EOF ) -agency_id_null_count="$(psql --csv -t -c "$agency_id_null")" +agency_id_null_count="$(duckdb -csv -noheader -c "$agency_id_null")" if [[ "$agency_id_null_count" != "0" ]]; then echo ">0 rows with agency_id = null" 1>&2 exit 1 @@ -57,7 +56,7 @@ FROM arrivals_departures ORDER BY stop_id, trip_id EOF ) -wheelchair_boarding_rows="$(psql --csv -t -c "$wheelchair_boarding_query")" +wheelchair_boarding_rows="$(duckdb -csv -noheader -c "$wheelchair_boarding_query")" wheelchair_boarding_expected="$(echo -e 
"airport,accessible\nairport-1,not_accessible\nlake,no_info_or_inherit\nmuseum,no_info_or_inherit")" if [[ "$wheelchair_boarding_rows" != "$wheelchair_boarding_expected" ]]; then echo "invalid wheelchair_boarding values" 1>&2 diff --git a/test/index.sh b/test/index.sh index a40fa08..e0b24fd 100755 --- a/test/index.sh +++ b/test/index.sh @@ -6,12 +6,11 @@ set -o pipefail cd "$(dirname $0)" set -x -psql -t -c 'SELECT version()' +duckdb --version ./calendar-dates-only.sh ./sample-gtfs-feed.sh ./amtrak-gtfs-2021-10-06.sh -./postgraphile.sh ./routes-without-agency-id.sh ./stops-without-level-id.sh ./invalid-empty-agency-id.sh diff --git a/test/invalid-empty-agency-id.sh b/test/invalid-empty-agency-id.sh index c4dbba8..a180ccf 100755 --- a/test/invalid-empty-agency-id.sh +++ b/test/invalid-empty-agency-id.sh @@ -9,8 +9,8 @@ set -x # Refer to https://github.com/public-transport/gtfs-via-postgres/issues/45 for context. # The "core" bug: A feed without routes.agency_id should be importable. -if ../cli.js -d --trips-without-shape-id -s -- \ - invalid-empty-agency-id/*.txt >/dev/null; then +if ../cli.js -d --trips-without-shape-id -s -- ':memory:' \ + invalid-empty-agency-id/*.txt; then echo "import didn't fail" 1>&2 exit 1 else @@ -20,6 +20,6 @@ fi # A related bug: With --routes-without-agency-id, lib/deps.js *does not* specify routes to depend on agency. # *In some cases*, this causes agency to be processed *after* routes, causing the routes processing to fail. # see also https://github.com/public-transport/gtfs-via-postgres/issues/45#issuecomment-1632649826 -../cli.js -d --routes-without-agency-id --trips-without-shape-id -s -- \ - invalid-empty-agency-id/*.txt >/dev/null +../cli.js -d --routes-without-agency-id --trips-without-shape-id -s -- ':memory:' \ + invalid-empty-agency-id/*.txt echo 'did not fail even with --routes-without-agency-id ✔' diff --git a/test/multiple-schemas.sh b/test/multiple-schemas.sh index 7ddabbd..a5564d4 100755 --- a/test/multiple-schemas.sh +++ b/test/multiple-schemas.sh @@ -11,18 +11,17 @@ env | grep '^PG' || true unzip -q -j -n amtrak-gtfs-2021-10-06.zip -d amtrak-gtfs-2021-10-06 ls -lh amtrak-gtfs-2021-10-06 -psql -c 'create database multiple_schemas' -export PGDATABASE='multiple_schemas' +path_to_db="$(mktemp -d -t gtfs)/multiple-schemas.duckdb" ../cli.js -d --trips-without-shape-id \ --schema one \ - -- amtrak-gtfs-2021-10-06/*.txt \ - | sponge | psql -b + "$path_to_db" \ + -- amtrak-gtfs-2021-10-06/*.txt ../cli.js -d --trips-without-shape-id \ --schema two \ - -- amtrak-gtfs-2021-10-06/*.txt \ - | sponge | psql -b + "$path_to_db" \ + -- amtrak-gtfs-2021-10-06/*.txt # https://dba.stackexchange.com/a/72656 nr_of_unequal_stops=$(cat << EOF @@ -48,7 +47,7 @@ WHERE ( EOF ) -unequal_stops_1=$(psql --csv -t -c "$nr_of_unequal_stops" | head -n 1) +unequal_stops_1=$(duckdb -csv -noheader -c "$nr_of_unequal_stops" | head -n 1) if [[ "$unequal_stops_1" -ne 0 ]]; then 1>&2 echo "$unequal_stops_1 unequal stops between one.stops & two.stops" exit 1 @@ -57,7 +56,7 @@ fi # todo: assert that more tables are equal? 
# put an incompatible version -psql -c "$(cat << EOF +duckdb -c "$(cat << EOF CREATE OR REPLACE FUNCTION public.gtfs_via_postgres_import_version() RETURNS TEXT AS \$\$ @@ -70,8 +69,8 @@ EOF # expect another import to fail if ../cli.js -d --trips-without-shape-id \ --schema three \ - -- amtrak-gtfs-2021-10-06/*.txt \ - | sponge | psql -b; then + "$path_to_db" \ + -- amtrak-gtfs-2021-10-06/*.txt; then 1>&2 echo "re-import with incompatible version didn't fail" exit 1 fi diff --git a/test/postgraphile.sh b/test/postgraphile.sh deleted file mode 100755 index f06bce1..0000000 --- a/test/postgraphile.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -set -e -set -u -set -o pipefail -cd "$(dirname $0)" -set -x - -env | grep '^PG' || true - -psql -c 'create database postgraphile' -export PGDATABASE='postgraphile' - -../cli.js -d --trips-without-shape-id --postgraphile -- \ - ../node_modules/sample-gtfs-feed/gtfs/*.txt \ - | sponge | psql -b - -# kill child processes on exit -# https://stackoverflow.com/questions/360201/how-do-i-kill-background-processes-jobs-when-my-shell-script-exits/2173421#2173421 -trap 'exit_code=$?; kill -- $(jobs -p); exit $exit_code' SIGINT SIGTERM EXIT - -../scripts/run-postgraphile.js & -sleep 2 - -body=$(node -e 'process.stdout.write(JSON.stringify({query: fs.readFileSync("sample-gtfs-feed-postgraphile-test.graphql", {encoding: "utf8"})}))') -actual_path="$(mktemp -t sample-gtfs-feed-postgraphile-test-XXX)" -curl -X POST 'http://localhost:3000/graphql' -H 'Content-Type: application/json' -H 'Accept: application/json' --data "$body" -fsS | jq -r --tab . >"$actual_path" - -git diff --exit-code sample-gtfs-feed-postgraphile-test.res.json "$actual_path" - -echo 'works ✔' diff --git a/test/routes-without-agency-id.sh b/test/routes-without-agency-id.sh index 51e1530..c63e296 100755 --- a/test/routes-without-agency-id.sh +++ b/test/routes-without-agency-id.sh @@ -7,7 +7,7 @@ cd "$(dirname $0)" set -x ../cli.js -d --routes-without-agency-id -- \ - ../node_modules/sample-gtfs-feed/gtfs/*.txt \ - >/dev/null + ':memory:' \ + ../node_modules/sample-gtfs-feed/gtfs/*.txt echo 'works ✔' diff --git a/test/sample-gtfs-feed.sh b/test/sample-gtfs-feed.sh index fac9c4e..b64c6b6 100755 --- a/test/sample-gtfs-feed.sh +++ b/test/sample-gtfs-feed.sh @@ -8,11 +8,12 @@ set -x env | grep '^PG' || true -psql -c 'create database sample_gtfs_feed' -export PGDATABASE='sample_gtfs_feed' +path_to_db="$(mktemp -d -t gtfs)/sample-gtfs-feed.duckdb" +# todo: what about sample-gtfs-feed@0.13? # --lower-case-lang-codes: Even though sample-gtfs-feed@0.11.2 *does not* contain invalid-case language codes (e.g. de_aT or de-at), we check that with --lower-case-lang-codes valid ones are still accepted. 
 ../cli.js -d --trips-without-shape-id --lower-case-lang-codes -- \
+	"$path_to_db" \
 	../node_modules/sample-gtfs-feed/gtfs/agency.txt \
 	../node_modules/sample-gtfs-feed/gtfs/calendar.txt \
 	../node_modules/sample-gtfs-feed/gtfs/calendar_dates.txt \
@@ -23,8 +24,7 @@ export PGDATABASE='sample_gtfs_feed'
 	../node_modules/sample-gtfs-feed/gtfs/stop_times.txt \
 	../node_modules/sample-gtfs-feed/gtfs/levels.txt \
 	../node_modules/sample-gtfs-feed/gtfs/pathways.txt \
-	../node_modules/sample-gtfs-feed/gtfs/translations.txt \
-	| sponge | psql -b
+	../node_modules/sample-gtfs-feed/gtfs/translations.txt
 
 query=$(cat << EOF
 select extract(epoch from t_arrival)::integer as t_arrival
@@ -34,13 +34,13 @@ order by t_arrival
 EOF
 )
 
-arr1=$(psql --csv -t -c "$query" | head -n 1)
+arr1=$(duckdb -csv -noheader "$path_to_db" -c "$query" | head -n 1)
 if [[ "$arr1" != "1553993700" ]]; then
 	echo "invalid 1st t_arrival: $arr1" 1>&2
 	exit 1
 fi
 
-arr2=$(psql --csv -t -c "$query" | head -n 2 | tail -n 1)
+arr2=$(duckdb -csv -noheader "$path_to_db" -c "$query" | head -n 2 | tail -n 1)
 if [[ "$arr2" != "1553994180" ]]; then
 	echo "invalid 2nd t_arrival: $arr2" 1>&2
 	exit 1
@@ -58,12 +58,12 @@ arrs_deps_b_downtown_on_working_days=$(cat << EOF
 	LIMIT 2
 EOF
 )
-freq_arr_dep1=$(psql --csv -t -c "$arrs_deps_b_downtown_on_working_days" | head -n 1)
+freq_arr_dep1=$(duckdb -csv -noheader "$path_to_db" -c "$arrs_deps_b_downtown_on_working_days" | head -n 1)
 if [[ "$freq_arr_dep1" != "1,1552028340,1552028400,1,1" ]]; then
 	echo "invalid/missing frequencies-based arrival/departure: $freq_arr_dep1" 1>&2
 	exit 1
 fi
-freq_arr_dep2=$(psql --csv -t -c "$arrs_deps_b_downtown_on_working_days" | head -n 2 | tail -n 1)
+freq_arr_dep2=$(duckdb -csv -noheader "$path_to_db" -c "$arrs_deps_b_downtown_on_working_days" | head -n 2 | tail -n 1)
 if [[ "$freq_arr_dep2" != "1,1552028640,1552028700,1,2" ]]; then
 	echo "invalid/missing frequencies-based arrival/departure: $freq_arr_dep1" 1>&2
 	exit 1
@@ -81,7 +81,7 @@ cons_b_downtown_on_working_days=$(cat << EOF
 	LIMIT 1
 EOF
 )
-freq_con1=$(psql --csv -t -c "$cons_b_downtown_on_working_days")
+freq_con1=$(duckdb -csv -noheader "$path_to_db" -c "$cons_b_downtown_on_working_days")
 if [[ "$freq_con1" != "1,1552028400,3,1552028760" ]]; then
 	echo "invalid/missing frequencies-based connection: $freq_con1" 1>&2
 	exit 1
@@ -96,7 +96,7 @@ connection_during_dst=$(cat << EOF
 	AND t_departure = '2019-03-31T01:58+01'
 EOF
 )
-dst1=$(psql --csv -t -c "$connection_during_dst" | head -n 1)
+dst1=$(duckdb -csv -noheader "$path_to_db" -c "$connection_during_dst" | head -n 1)
 if [[ "$dst1" != "0,1553993880" ]]; then
 	echo "invalid/missing DST t_departure: $dst1" 1>&2
 	exit 1
@@ -113,7 +113,7 @@ airport_levels=$(cat << EOF
 	LIMIT 1
 EOF
 )
-lvl1=$(psql --csv -t -c "$airport_levels" | head -n 1)
+lvl1=$(duckdb -csv -noheader "$path_to_db" -c "$airport_levels" | head -n 1)
 if [[ "$lvl1" != "airport-level-0,0,ground level" ]]; then
 	echo "invalid/missing lowest airport-% level: $lvl1" 1>&2
 	exit 1
@@ -129,7 +129,7 @@ airportPathway=$(cat << EOF
 	LIMIT 1
 EOF
 )
-pw1=$(psql --csv -t -c "$airportPathway" | head -n 1)
+pw1=$(duckdb -csv -noheader "$path_to_db" -c "$airportPathway" | head -n 1)
 if [[ "$pw1" != "escalator,f" ]]; then
 	echo "invalid/missing DST t_departure: $pw1" 1>&2
 	exit 1
@@ -143,7 +143,7 @@ timepoint_exact=$(cat << EOF
 	LIMIT 1
 EOF
 )
-exact1=$(psql --csv -t -c "$timepoint_exact" | head -n 1)
+exact1=$(duckdb -csv -noheader "$path_to_db" -c "$timepoint_exact" | head -n 1)
 if [[ "$exact1" != "exact" ]]; then
 	echo "invalid/missing DST t_departure: $exact1" 1>&2
 	exit 1
@@ -157,7 +157,7 @@ stops_translations=$(cat << EOF
 	AND record_id = 'airport-entrance'
 EOF
 )
-airport_entrance_translation=$(psql --csv -t -c "$stops_translations")
+airport_entrance_translation=$(duckdb -csv -noheader "$path_to_db" -c "$stops_translations")
 if [[ "$airport_entrance_translation" != "Eingang,de-DE" ]]; then
 	echo "invalid/missing stop translation: $airport_entrance_translation" 1>&2
 	exit 1
@@ -173,7 +173,7 @@ stops_translated=$(cat << EOF
 	AND stop_id = 'airport-entrance'
 EOF
 )
-translated_airport_entrance=$(psql --csv -t -c "$stops_translated")
+translated_airport_entrance=$(duckdb -csv -noheader "$path_to_db" -c "$stops_translated")
 if [[ "$translated_airport_entrance" != "airport-entrance,Eingang,de-DE" ]]; then
 	echo "invalid/missing translated stop: $translated_airport_entrance" 1>&2
 	exit 1
@@ -187,7 +187,7 @@ WHERE route_id = ANY(ARRAY['A', 'B'])
 ORDER BY trip_id
 EOF
 )
-wheelchair_accessible_arrs_deps_rows="$(psql --csv -t -c "$wheelchair_accessible_arrs_deps_query")"
+wheelchair_accessible_arrs_deps_rows="$(duckdb -csv -noheader "$path_to_db" -c "$wheelchair_accessible_arrs_deps_query")"
 wheelchair_accessible_arrs_deps_expected=$(cat << EOF
 a-downtown-all-day,
 a-outbound-all-day,
@@ -210,7 +210,7 @@ WHERE route_id = ANY(ARRAY['A', 'B'])
 ORDER BY trip_id
 EOF
 )
-bikes_allowed_arrs_deps_rows="$(psql --csv -t -c "$bikes_allowed_arrs_deps_query")"
+bikes_allowed_arrs_deps_rows="$(duckdb -csv -noheader "$path_to_db" -c "$bikes_allowed_arrs_deps_query")"
 bikes_allowed_arrs_deps_expected=$(cat << EOF
 a-downtown-all-day,
 a-outbound-all-day,
@@ -231,7 +231,7 @@ FROM arrivals_departures
 WHERE trip_id = 'b-downtown-on-working-days' AND "date" = '2019-05-29' AND frequencies_it = 3
 EOF
 )
-frequencies_it_rows="$(psql --csv -t -c "$frequencies_it_query")"
+frequencies_it_rows="$(duckdb -csv -noheader "$path_to_db" -c "$frequencies_it_query")"
 frequencies_it_expected=$(cat << EOF
 2019-05-29 08:10:00+02,1,airport
 2019-05-29 08:18:00+02,3,lake
@@ -253,7 +253,7 @@ FROM stops_translated
 WHERE stop_id LIKE 'airport%'
 EOF
 )
-stops_translated_rows="$(psql --csv -t -c "$stops_translated_query")"
+stops_translated_rows="$(duckdb -csv -noheader "$path_to_db" -c "$stops_translated_query")"
 stops_translated_expected=$(cat << EOF
 airport,International Airport (ABC),,train station at the Internationl Airport (ABC),,https://fta.example.org/stations/airport.html,
 airport-1,Gleis 1,de-DE,Platform 1,,,
diff --git a/test/stops-without-level-id.sh b/test/stops-without-level-id.sh
index 7431429..7a1edcb 100755
--- a/test/stops-without-level-id.sh
+++ b/test/stops-without-level-id.sh
@@ -11,13 +11,14 @@ shopt -s extglob
 
 # When omitting levels.txt, --stops-without-level-id/opt.stopsWithoutLevelId should be true by default.
 # see also https://github.com/public-transport/gtfs-via-postgres/issues/43
 ../cli.js -d -s -- \
+	':memory:' \
 	../node_modules/sample-gtfs-feed/gtfs/!(levels).txt \
 	| grep -c 'stopsWithoutLevelId: true'
 
 # Importing should work *with* --stops-without-level-id (and without levels.txt).
 # see also https://github.com/public-transport/gtfs-via-postgres/issues/43#issuecomment-1632657546
 ../cli.js -d -s --stops-without-level-id -- \
+	':memory:' \
-	../node_modules/sample-gtfs-feed/gtfs/!(levels).txt \
-	>/dev/null
+	../node_modules/sample-gtfs-feed/gtfs/!(levels).txt
 echo 'works ✔'
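
A rough sketch of the workflow the tests above exercise, for manual checking only (not part of the patch; it assumes a checkout with sample-gtfs-feed installed and the duckdb CLI available):

	path_to_db="$(mktemp -d)/sample-gtfs-feed.duckdb"
	# import the sample feed into a fresh DuckDB database file
	./cli.js -d --trips-without-shape-id -- \
		"$path_to_db" \
		node_modules/sample-gtfs-feed/gtfs/*.txt
	# query the imported data
	duckdb -csv -noheader "$path_to_db" -c 'SELECT count(*) FROM stops'

The tests that only need to check that an import succeeds pass ':memory:' instead of a file path, so no database file is left behind.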