From 44a5be2073504cb0f40abb723159fbef58a43c48 Mon Sep 17 00:00:00 2001 From: Dale Mcdiarmid Date: Tue, 3 Dec 2024 11:07:00 +0000 Subject: [PATCH] move to sql.clickhouse.com --- README.md | 6 +-- scripts/load_pypi_previous.sh | 16 +----- scripts/update_github.sh | 93 +++++++++++++++++++++++++++++++++++ src/utils/clickhouse.js | 5 +- 4 files changed, 102 insertions(+), 18 deletions(-) create mode 100644 scripts/update_github.sh diff --git a/README.md b/README.md index 575704a..bb6144e 100644 --- a/README.md +++ b/README.md @@ -252,14 +252,14 @@ We cover both options below. For users wishing to make changes to just the app and use the existing ClickHouse instance with the data, the following credentials can be used: ``` -host: https://clickpy-clickhouse.clickhouse.com +host: https://sql-clickhouse.clickhouse.com port: 443 user: play ``` Users can connect to this instance with the clickhouse-client and issue queries i.e. ```bash -clickhouse client -h clickpy-clickhouse.clickhouse.com --user play --secure +clickhouse client -h sql-clickhouse.clickhouse.com --user play --secure ``` See [App Configuration](#configuration). @@ -376,7 +376,7 @@ Copy the file `.env.example` to `.env.local`. Modify the settings with your clickhouse cluster details, e.g. if using the public instance. ``` -CLICKHOUSE_HOST=https://clickpy-clickhouse.clickhouse.com +CLICKHOUSE_HOST=https://sql-clickhouse.clickhouse.com CLICKHOUSE_USERNAME=play CLICKHOUSE_PASSWORD= PYPI_DATABASE=pypi diff --git a/scripts/load_pypi_previous.sh b/scripts/load_pypi_previous.sh index b5d2e9a..997e3a4 100644 --- a/scripts/load_pypi_previous.sh +++ b/scripts/load_pypi_previous.sh @@ -1,25 +1,13 @@ #!/bin/bash -# Get the current date in epoch format -current_date_epoch=$(date -u +%s) - -# Calculate the number of seconds in a day (86400 seconds) -seconds_in_a_day=86400 - -# Calculate the previous day's date in epoch format -previous_day_epoch=$((current_date_epoch - seconds_in_a_day)) - -# Calculate the epoch timestamp for midnight UTC -midnight_utc_epoch=$((previous_day_epoch / seconds_in_a_day * seconds_in_a_day)) - - CLICKHOUSE_USER=${CLICKHOUSE_USER:-default} CLICKHOUSE_HOST=${CLICKHOUSE_HOST:-localhost} CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD:-} +midnight_utc_epoch=$(clickhouse client --host ${CLICKHOUSE_HOST} --secure --password ${CLICKHOUSE_PASSWORD} --user ${CLICKHOUSE_USER} --query "SELECT CAST(CAST(max(date) + toIntervalDay(1), 'DateTime'), 'Int64') FROM pypi.pypi_downloads_per_day") gsutil ls "gs://clickhouse-pypi/file_downloads/incremental/${midnight_utc_epoch}-*.parquet" | sed 's|gs://|https://storage.googleapis.com/|' > /opt/pypi/pypi-${midnight_utc_epoch}.txt echo "scheduling pypi load" -python queue_files.py --host ${CLICKHOUSE_HOST} --port 8443 --username ${CLICKHOUSE_USER} --password ${CLICKHOUSE_PASSWORD} --file "/opt/pypi/pypi-${midnight_utc_epoch}.txt" --task_database default --task_table tasks --files_chunk_size_min 500 --files_chunk_size_max 1000 +python3 queue_files.py --host ${CLICKHOUSE_HOST} --port 8443 --username ${CLICKHOUSE_USER} --password ${CLICKHOUSE_PASSWORD} --file "/opt/pypi/pypi-${midnight_utc_epoch}.txt" --task_database default --task_table tasks --files_chunk_size_min 500 --files_chunk_size_max 1000 diff --git a/scripts/update_github.sh b/scripts/update_github.sh new file mode 100644 index 0000000..56b2e3d --- /dev/null +++ b/scripts/update_github.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +mkdir -p gharchive + +CLICKHOUSE_USER=default +CLICKHOUSE_HOST=${CLICKHOUSE_HOST:-localhost} 
+CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD:-} + +if [ -z "$1" ]; then + echo "checking for latest date..." + min_date=$(clickhouse-client --host ${CLICKHOUSE_HOST} --secure --password ${CLICKHOUSE_PASSWORD} --user ${CLICKHOUSE_USER} --query "SELECT max(file_time) FROM github.github_events"); + if [ "$min_date" == "1970-01-01 00:00:00" ]; then min_date=$(date -d '24 hour ago' '+%Y-%m-%d %H:00:00'); fi +else + min_date=$1; +fi + +if [ -z "$2" ]; then max_date=$(date '+%Y-%m-%d %H:00:00'); else max_date=$2; fi + +echo "min date: ${min_date}" +echo "max date: ${max_date}" + +echo "downloading files..." + +clickhouse-local --query "WITH (SELECT (parseDateTimeBestEffort('${max_date}') - (parseDateTimeBestEffort('${min_date}') + INTERVAL 1 HOUR))/3600) as hours SELECT toString(toDate( (parseDateTimeBestEffort('${min_date}') + INTERVAL 1 HOUR) + INTERVAL arrayJoin(range(0, ifNull(toUInt64(hours) + 1, 0))) HOUR as t)) || '-' || toString(toHour(t)) || '.json.gz'" | xargs -I{} bash -c "[ -f ../gharchive/{} ] || wget --continue 'https://data.gharchive.org/{}'" + +echo "inserting files..." + +find . -maxdepth 1 -name '*.json.gz' | xargs -P$(nproc) -I{} bash -c " +gzip -cd {} | jq -c ' +[ + (\"{}\" | scan(\"[0-9]+-[0-9]+-[0-9]+-[0-9]+\")), + .type, + .actor.login? // .actor_attributes.login? // (.actor | strings) // null, + .repo.name? // (.repository.owner? + \"/\" + .repository.name?) // null, + .repo.id, + .created_at, + .payload.updated_at? // .payload.comment?.updated_at? // .payload.issue?.updated_at? // .payload.pull_request?.updated_at? // null, + .payload.action, + .payload.comment.id, + .payload.review.body // .payload.comment.body // .payload.issue.body? // .payload.pull_request.body? // .payload.release.body? // null, + .payload.comment?.path? // null, + .payload.comment?.position? // null, + .payload.comment?.line? // null, + .payload.ref? // null, + .payload.ref_type? // null, + .payload.comment.user?.login? // .payload.issue.user?.login? // .payload.pull_request.user?.login? // null, + .payload.issue.number? // .payload.pull_request.number? // .payload.number? // null, + .payload.issue.title? // .payload.pull_request.title? // null, + [.payload.issue.labels?[]?.name // .payload.pull_request.labels?[]?.name], + .payload.issue.state? // .payload.pull_request.state? // null, + .payload.issue.locked? // .payload.pull_request.locked? // null, + .payload.issue.assignee?.login? // .payload.pull_request.assignee?.login? // null, + [.payload.issue.assignees?[]?.login? // .payload.pull_request.assignees?[]?.login?], + .payload.issue.comments? // .payload.pull_request.comments? // null, + .payload.review.author_association // .payload.issue.author_association? // .payload.pull_request.author_association? // null, + .payload.issue.closed_at? // .payload.pull_request.closed_at? // null, + .payload.pull_request.merged_at? // null, + .payload.pull_request.merge_commit_sha? // null, + [.payload.pull_request.requested_reviewers?[]?.login], + [.payload.pull_request.requested_teams?[]?.name], + .payload.pull_request.head?.ref? // null, + .payload.pull_request.head?.sha? // null, + .payload.pull_request.base?.ref? // null, + .payload.pull_request.base?.sha? // null, + .payload.pull_request.merged? // null, + .payload.pull_request.mergeable? // null, + .payload.pull_request.rebaseable? // null, + .payload.pull_request.mergeable_state? // null, + .payload.pull_request.merged_by?.login? // null, + .payload.pull_request.review_comments? // null, + .payload.pull_request.maintainer_can_modify? 
// null, + .payload.pull_request.commits? // null, + .payload.pull_request.additions? // null, + .payload.pull_request.deletions? // null, + .payload.pull_request.changed_files? // null, + .payload.comment.diff_hunk? // null, + .payload.comment.original_position? // null, + .payload.comment.commit_id? // null, + .payload.comment.original_commit_id? // null, + .payload.size? // null, + .payload.distinct_size? // null, + .payload.member.login? // .payload.member? // null, + .payload.release?.tag_name? // null, + .payload.release?.name? // null, + .payload.review?.state? // null +]' | clickhouse-client --input_format_null_as_default 1 --date_time_input_format best_effort --host ${CLICKHOUSE_HOST} --secure --password ${CLICKHOUSE_PASSWORD} --user ${CLICKHOUSE_USER} --query 'INSERT INTO github.github_events FORMAT JSONCompactEachRow' || echo 'File {} has issues' +" && mv *.json.gz ./gharchive + +echo "generating cron entry" + +current_dir=$(pwd) + +mv /opt/pypi/gharchive/*.gz /data/github/gharchive/ \ No newline at end of file diff --git a/src/utils/clickhouse.js b/src/utils/clickhouse.js index 759d3ee..68f9b9b 100644 --- a/src/utils/clickhouse.js +++ b/src/utils/clickhouse.js @@ -6,6 +6,9 @@ export const clickhouse = createClient({ host: process.env.CLICKHOUSE_HOST, username: process.env.CLICKHOUSE_USERNAME, password: process.env.CLICKHOUSE_PASSWORD, + clickhouse_settings: { + allow_experimental_analyzer: 0, + } }); export const web_clickhouse = createWebClient({ @@ -730,7 +733,7 @@ async function query(query_name, query, query_params) { .filter(([, value]) => value !== undefined) .map(([key, value]) => [`param_${key}`, Array.isArray(value) ? `['${value.join("','")}']` : value]) ); - query_link = `${query_link}&${Object.entries(prefixedParams).map(([name, value]) => `${encodeURIComponent(name)}=${encodeURIComponent(value)}`).join('&')}` + query_link = `${query_link}&tab=results&${Object.entries(prefixedParams).map(([name, value]) => `${encodeURIComponent(name)}=${encodeURIComponent(value)}`).join('&')}` } return Promise.all([Promise.resolve(query_link), results.json()]); }
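
A quick way to sanity-check the new public endpoint this patch switches to, before or after updating `.env.local`. This is only a sketch: it assumes the `play` user remains passwordless/read-only and that the HTTP interface is reachable on the same port 443 listed in the README.

```bash
# Native protocol, same invocation as the README:
clickhouse client -h sql-clickhouse.clickhouse.com --user play --secure \
  --query "SELECT 1"

# HTTP interface, which is what @clickhouse/client in src/utils/clickhouse.js
# talks to via CLICKHOUSE_HOST (port 443 assumed, per the README):
curl -sS "https://sql-clickhouse.clickhouse.com:443/?user=play" \
  --data-binary "SELECT version()"
```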
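
The rewritten `scripts/load_pypi_previous.sh` now derives `midnight_utc_epoch` from the data itself (the day after the `max(date)` already loaded) rather than from the wall clock. A worked example of what that expression evaluates to, runnable with `clickhouse-local` using an illustrative last-loaded day of 2024-12-02 (`session_timezone` is pinned only so the local run matches the UTC naming of the incremental files):

```bash
# Patched query: SELECT CAST(CAST(max(date) + toIntervalDay(1), 'DateTime'), 'Int64')
#                FROM pypi.pypi_downloads_per_day
# With max(date) = 2024-12-02 it yields midnight UTC of the following day:
clickhouse-local --query "
  SELECT CAST(CAST(toDate('2024-12-02') + toIntervalDay(1), 'DateTime'), 'Int64')
  SETTINGS session_timezone = 'UTC'"
# -> 1733184000, i.e. 2024-12-03 00:00:00 UTC, matching the
#    gs://clickhouse-pypi/file_downloads/incremental/1733184000-*.parquet prefix
```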
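
The new `scripts/update_github.sh` builds its download list with a `clickhouse-local` one-liner that expands the `(min_date, max_date]` window into one GH Archive file name per hour. A standalone sketch of that expansion with illustrative dates plugged in (the real script substitutes `${min_date}`/`${max_date}`):

```bash
# Expands a 3-hour window into hourly gharchive file names (hours are not
# zero-padded, matching data.gharchive.org naming):
clickhouse-local --query "
  WITH (SELECT (parseDateTimeBestEffort('2024-12-03 03:00:00')
             - (parseDateTimeBestEffort('2024-12-03 00:00:00') + INTERVAL 1 HOUR)) / 3600) AS hours
  SELECT toString(toDate((parseDateTimeBestEffort('2024-12-03 00:00:00') + INTERVAL 1 HOUR)
       + INTERVAL arrayJoin(range(0, ifNull(toUInt64(hours) + 1, 0))) HOUR AS t))
       || '-' || toString(toHour(t)) || '.json.gz'"
# -> 2024-12-03-1.json.gz
#    2024-12-03-2.json.gz
#    2024-12-03-3.json.gz
```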