Skip to content

Commit

Permalink
Merge pull request #108 from ClickHouse/move_to_sql
Browse files Browse the repository at this point in the history
move to sql.clickhouse.com
  • Loading branch information
gingerwizard authored Dec 3, 2024
2 parents 7a41195 + 44a5be2 commit c1a108f
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 18 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -252,14 +252,14 @@ We cover both options below.
For users wishing to make changes to just the app and use the existing ClickHouse instance with the data, the following credentials can be used:

```
host: https://clickpy-clickhouse.clickhouse.com
host: https://sql-clickhouse.clickhouse.com
port: 443
user: play
```
Users can connect to this instance with the clickhouse-client and issue queries, e.g.

```bash
clickhouse client -h clickpy-clickhouse.clickhouse.com --user play --secure
clickhouse client -h sql-clickhouse.clickhouse.com --user play --secure
```

See [App Configuration](#configuration).
Expand Down Expand Up @@ -376,7 +376,7 @@ Copy the file `.env.example` to `.env.local`.
Modify the settings with your clickhouse cluster details, e.g. if using the public instance.

```
CLICKHOUSE_HOST=https://clickpy-clickhouse.clickhouse.com
CLICKHOUSE_HOST=https://sql-clickhouse.clickhouse.com
CLICKHOUSE_USERNAME=play
CLICKHOUSE_PASSWORD=
PYPI_DATABASE=pypi
Expand Down
16 changes: 2 additions & 14 deletions scripts/load_pypi_previous.sh
Original file line number Diff line number Diff line change
@@ -1,25 +1,13 @@
#!/bin/bash

# Get the current date in epoch format
current_date_epoch=$(date -u +%s)

# Calculate the number of seconds in a day (86400 seconds)
seconds_in_a_day=86400

# Calculate the previous day's date in epoch format
previous_day_epoch=$((current_date_epoch - seconds_in_a_day))

# Calculate the epoch timestamp for midnight UTC
midnight_utc_epoch=$((previous_day_epoch / seconds_in_a_day * seconds_in_a_day))


CLICKHOUSE_USER=${CLICKHOUSE_USER:-default}
CLICKHOUSE_HOST=${CLICKHOUSE_HOST:-localhost}
CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD:-}

midnight_utc_epoch=$(clickhouse client --host ${CLICKHOUSE_HOST} --secure --password ${CLICKHOUSE_PASSWORD} --user ${CLICKHOUSE_USER} --query "SELECT CAST(CAST(max(date) + toIntervalDay(1), 'DateTime'), 'Int64') FROM pypi.pypi_downloads_per_day")

gsutil ls "gs://clickhouse-pypi/file_downloads/incremental/${midnight_utc_epoch}-*.parquet" | sed 's|gs://|https://storage.googleapis.com/|' > /opt/pypi/pypi-${midnight_utc_epoch}.txt

echo "scheduling pypi load"

python queue_files.py --host ${CLICKHOUSE_HOST} --port 8443 --username ${CLICKHOUSE_USER} --password ${CLICKHOUSE_PASSWORD} --file "/opt/pypi/pypi-${midnight_utc_epoch}.txt" --task_database default --task_table tasks --files_chunk_size_min 500 --files_chunk_size_max 1000
python3 queue_files.py --host ${CLICKHOUSE_HOST} --port 8443 --username ${CLICKHOUSE_USER} --password ${CLICKHOUSE_PASSWORD} --file "/opt/pypi/pypi-${midnight_utc_epoch}.txt" --task_database default --task_table tasks --files_chunk_size_min 500 --files_chunk_size_max 1000
93 changes: 93 additions & 0 deletions scripts/update_github.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#!/bin/bash
# Incrementally load GH Archive (data.gharchive.org) hourly event dumps
# into the github.github_events table of a ClickHouse instance.
#
# Usage: update_github.sh [min_date] [max_date]
#   min_date / max_date are 'YYYY-MM-DD HH:00:00' timestamps. When
#   omitted, min_date is derived from the newest file_time already in
#   the table and max_date defaults to the current hour.

# Directory where processed archives are parked after a successful run.
mkdir -p gharchive

# Connection settings; all three can be supplied via the environment.
# Fix: CLICKHOUSE_USER was previously hard-coded to "default", silently
# ignoring any value exported by the caller — inconsistent with the
# ${VAR:-default} pattern used for host/password here and in the
# sibling load_pypi_previous.sh script.
CLICKHOUSE_USER=${CLICKHOUSE_USER:-default}
CLICKHOUSE_HOST=${CLICKHOUSE_HOST:-localhost}
CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD:-}

# Lower bound of the window: argument 1, or the newest file_time already
# loaded, or (for an empty table, which returns the epoch) 24 hours ago.
if [ -z "$1" ]; then
echo "checking for latest date..."
min_date=$(clickhouse-client --host ${CLICKHOUSE_HOST} --secure --password ${CLICKHOUSE_PASSWORD} --user ${CLICKHOUSE_USER} --query "SELECT max(file_time) FROM github.github_events");
# NOTE(review): 'date -d' is GNU coreutils only — not portable to BSD/macOS.
if [ "$min_date" == "1970-01-01 00:00:00" ]; then min_date=$(date -d '24 hour ago' '+%Y-%m-%d %H:00:00'); fi
else
min_date=$1;
fi

# Upper bound of the window: argument 2, or the current hour.
if [ -z "$2" ]; then max_date=$(date '+%Y-%m-%d %H:00:00'); else max_date=$2; fi

echo "min date: ${min_date}"
echo "max date: ${max_date}"

echo "downloading files..."

# Enumerate the hourly file names in (min_date, max_date] — e.g.
# "2024-12-03-14.json.gz" — using clickhouse-local as a date calculator,
# then fetch each from data.gharchive.org unless already present.
# NOTE(review): the existence check looks in ../gharchive/ while the
# directory created above is ./gharchive — confirm the intended cwd.
clickhouse-local --query "WITH (SELECT (parseDateTimeBestEffort('${max_date}') - (parseDateTimeBestEffort('${min_date}') + INTERVAL 1 HOUR))/3600) as hours SELECT toString(toDate( (parseDateTimeBestEffort('${min_date}') + INTERVAL 1 HOUR) + INTERVAL arrayJoin(range(0, ifNull(toUInt64(hours) + 1, 0))) HOUR as t)) || '-' || toString(toHour(t)) || '.json.gz'" | xargs -I{} bash -c "[ -f ../gharchive/{} ] || wget --continue 'https://data.gharchive.org/{}'"

echo "inserting files..."

# For every downloaded archive (in parallel, one job per CPU): gunzip,
# flatten each JSON event into a fixed-order JSON array with jq
# (handling both the pre- and post-2015 GH Archive schemas via // and
# optional '?' access), and stream it into ClickHouse as
# JSONCompactEachRow. A failing file is reported but does not abort the
# run; archives are parked in ./gharchive only if every file succeeded.
find . -maxdepth 1 -name '*.json.gz' | xargs -P$(nproc) -I{} bash -c "
gzip -cd {} | jq -c '
[
(\"{}\" | scan(\"[0-9]+-[0-9]+-[0-9]+-[0-9]+\")),
.type,
.actor.login? // .actor_attributes.login? // (.actor | strings) // null,
.repo.name? // (.repository.owner? + \"/\" + .repository.name?) // null,
.repo.id,
.created_at,
.payload.updated_at? // .payload.comment?.updated_at? // .payload.issue?.updated_at? // .payload.pull_request?.updated_at? // null,
.payload.action,
.payload.comment.id,
.payload.review.body // .payload.comment.body // .payload.issue.body? // .payload.pull_request.body? // .payload.release.body? // null,
.payload.comment?.path? // null,
.payload.comment?.position? // null,
.payload.comment?.line? // null,
.payload.ref? // null,
.payload.ref_type? // null,
.payload.comment.user?.login? // .payload.issue.user?.login? // .payload.pull_request.user?.login? // null,
.payload.issue.number? // .payload.pull_request.number? // .payload.number? // null,
.payload.issue.title? // .payload.pull_request.title? // null,
[.payload.issue.labels?[]?.name // .payload.pull_request.labels?[]?.name],
.payload.issue.state? // .payload.pull_request.state? // null,
.payload.issue.locked? // .payload.pull_request.locked? // null,
.payload.issue.assignee?.login? // .payload.pull_request.assignee?.login? // null,
[.payload.issue.assignees?[]?.login? // .payload.pull_request.assignees?[]?.login?],
.payload.issue.comments? // .payload.pull_request.comments? // null,
.payload.review.author_association // .payload.issue.author_association? // .payload.pull_request.author_association? // null,
.payload.issue.closed_at? // .payload.pull_request.closed_at? // null,
.payload.pull_request.merged_at? // null,
.payload.pull_request.merge_commit_sha? // null,
[.payload.pull_request.requested_reviewers?[]?.login],
[.payload.pull_request.requested_teams?[]?.name],
.payload.pull_request.head?.ref? // null,
.payload.pull_request.head?.sha? // null,
.payload.pull_request.base?.ref? // null,
.payload.pull_request.base?.sha? // null,
.payload.pull_request.merged? // null,
.payload.pull_request.mergeable? // null,
.payload.pull_request.rebaseable? // null,
.payload.pull_request.mergeable_state? // null,
.payload.pull_request.merged_by?.login? // null,
.payload.pull_request.review_comments? // null,
.payload.pull_request.maintainer_can_modify? // null,
.payload.pull_request.commits? // null,
.payload.pull_request.additions? // null,
.payload.pull_request.deletions? // null,
.payload.pull_request.changed_files? // null,
.payload.comment.diff_hunk? // null,
.payload.comment.original_position? // null,
.payload.comment.commit_id? // null,
.payload.comment.original_commit_id? // null,
.payload.size? // null,
.payload.distinct_size? // null,
.payload.member.login? // .payload.member? // null,
.payload.release?.tag_name? // null,
.payload.release?.name? // null,
.payload.review?.state? // null
]' | clickhouse-client --input_format_null_as_default 1 --date_time_input_format best_effort --host ${CLICKHOUSE_HOST} --secure --password ${CLICKHOUSE_PASSWORD} --user ${CLICKHOUSE_USER} --query 'INSERT INTO github.github_events FORMAT JSONCompactEachRow' || echo 'File {} has issues'
" && mv *.json.gz ./gharchive

echo "generating cron entry"

# NOTE(review): removed an unused 'current_dir=$(pwd)' assignment here;
# no cron entry is actually generated by this script — confirm whether
# that step was intended. The final move assumes fixed /opt/pypi and
# /data/github paths — verify they exist on the target host.
mv /opt/pypi/gharchive/*.gz /data/github/gharchive/
5 changes: 4 additions & 1 deletion src/utils/clickhouse.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ export const clickhouse = createClient({
host: process.env.CLICKHOUSE_HOST,
username: process.env.CLICKHOUSE_USERNAME,
password: process.env.CLICKHOUSE_PASSWORD,
clickhouse_settings: {
allow_experimental_analyzer: 0,
}
});

export const web_clickhouse = createWebClient({
Expand Down Expand Up @@ -730,7 +733,7 @@ async function query(query_name, query, query_params) {
.filter(([, value]) => value !== undefined)
.map(([key, value]) => [`param_${key}`, Array.isArray(value) ? `['${value.join("','")}']` : value])
);
query_link = `${query_link}&${Object.entries(prefixedParams).map(([name, value]) => `${encodeURIComponent(name)}=${encodeURIComponent(value)}`).join('&')}`
query_link = `${query_link}&tab=results&${Object.entries(prefixedParams).map(([name, value]) => `${encodeURIComponent(name)}=${encodeURIComponent(value)}`).join('&')}`
}
return Promise.all([Promise.resolve(query_link), results.json()]);
}

0 comments on commit c1a108f

Please sign in to comment.