From 3e7f29ddc0738d42ea94d6e9e478ae237d481634 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Thu, 12 Dec 2019 14:25:51 +0300 Subject: [PATCH 1/7] Add valgrind tests --- .circleci/config.yml | 61 +++++++++++++++++++++++ README.md | 86 +++++++++++++++++++++++++++++++++ azure/citus-bot.sh | 13 ++++- azure/create-cluster.sh | 17 ++++++- azure/finalize-valgrind-test.sh | 38 +++++++++++++++ azure/push-results.sh | 37 ++++++++++++++ azure/run-all-tests.sh | 39 +++++++-------- fabfile/config.py | 13 +++++ fabfile/run.py | 76 ++++++++++++++++++++++++++++- fabfile/setup.py | 22 +++++---- fabfile/use.py | 24 +++++++-- 11 files changed, 391 insertions(+), 35 deletions(-) create mode 100755 azure/finalize-valgrind-test.sh create mode 100755 azure/push-results.sh diff --git a/.circleci/config.yml b/.circleci/config.yml index 52090427..ce2bfdf6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -63,6 +63,40 @@ jobs: name: delete the given resource group no_output_timeout: 10m + valgrind-test: + docker: + - image: buildpack-deps:trusty + + working_directory: /home/circleci/project + steps: + - azure-cli/install + - azure-cli/login-with-service-principal + - checkout + - run: + command: | + cd ./azure + ./add-sshkey.sh + ./citus-bot.sh citusbot_valgrind_test_resource_group + name: install dependencies and run valgrind tests + no_output_timeout: 10m + + finalize-valgrind-test: + docker: + - image: buildpack-deps:trusty + + working_directory: /home/circleci/project + steps: + - azure-cli/install + - azure-cli/login-with-service-principal + - checkout + - run: + command: | + cd ./azure + ./add-sshkey.sh + ./finalize-valgrind-test.sh + name: install dependencies and run valgrind tests + no_output_timeout: 10m + orbs: azure-cli: circleci/azure-cli@1.0.0 @@ -97,3 +131,30 @@ workflows: only: - /tpch\/.*/ # match with tpch/ prefix - /all_performance_test\/.*/ # match with all_performance_test/ prefix + + # perform weekly valgrind test on azure every monday at 00:00 + # 
https://crontab.guru/#0_0_*_*_1 + weekly-valgrind: + triggers: + - schedule: + cron: "0 0 * * 1" + filters: + branches: + only: + - master + jobs: + - valgrind-test + + # Since valgrind tests really take a long time to finish, wait for 9.5 hours. + # Then push valgrind test results and terminate the machine. + # https://crontab.guru/#30_9_*_*_1 + weekly-valgrind-finalize: + triggers: + - schedule: + cron: "30 9 * * 1" + filters: + branches: + only: + - master + jobs: + - finalize-valgrind-test diff --git a/README.md b/README.md index cabc2a76..19dfea41 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ required for testing citus. * [Running PgBench Tests Against Hyperscale (Citus)](#pgbench-cloud) * [Running TPC-H Tests](#tpch) * [Running TPC-H Tests Against Hyperscale (Citus)](#tpch-cloud) + * [Running Valgrind Tests](#valgrind) * [Example fab Commands](#fab-examples) * [Tasks, and Ordering of Tasks](#fab-tasks) * [Task Namespaces](#task-namespaces) @@ -602,6 +603,91 @@ On the coordinator node: fab run.tpch_automate:tpch_q1.ini,connectionURI='postgres://citus:dwVg70yBfkZ6hO1WXFyq1Q@c.fhhwxh5watzbizj3folblgbnpbu.db.citusdata.com:5432/citus?sslmode\=require' ``` +## Running Valgrind Tests + +TL;DR + +```bash +# 1 # start valgrind test + +# create valgrind instance to run +eval `ssh-agent -s` +ssh-add +export GIT_USERNAME= +export GIT_TOKEN= # You can create a github token from https://github.com/settings/tokens. 
+export RESOURCE_GROUP_NAME='your-valgrind-test-rg-name-here' +export VALGRIND_TEST=1 +cd azure +./create-cluster.sh + +# connect to coordinator +./connect.sh + +# run fab command in coordinator in a detachable session +sudo yum install tmux +tmux new -d "fab use.postgres:12.3 use.enterprise:enterprise-master run.valgrind" + +# simply exit from coordinator after detaching + +# 2 # finalize valgrind test + +# reconnect to coordinator after 9.5 hours (if you used the default coordinator configuration) +export RESOURCE_GROUP_NAME='your-valgrind-test-rg-name-here' +./connect.sh + +# you can first check if valgrind test is finished by attaching to tmux session +tmux a +# then you should detach from the session before moving forward +Ctrl+b d + +# run push results script (the required argument is the prefix of the branch that the results are pushed to) +cd test-automation/azure +./push-results.sh 'your-results-branch-prefix-here' + +# simply exit from coordinator after pushing the results + +# delete resource group finally +cd azure +./delete-resource-group.sh +``` + +DETAILS: + +To create a valgrind instance, following the steps in [Setup Steps For Each Test](#azure-setup-steps), do the following before executing `create-cluster.sh`: + +```bash +export VALGRIND_TEST=1 +``` + +Setting this makes the `numberOfWorkers` setting ineffective: `create-cluster.sh` overrides it with 0. +This is because the valgrind test uses our regression test structure, which creates a local cluster +itself. Also, `valgrind` is installed only on the coordinator, so if the cluster had worker nodes, building +PostgreSQL would fail on them for lack of `valgrind`, even though the workers are not needed.
+ +On the coordinator node: + +```bash +# an example usage: Use PostgreSQL 12.1 and run valgrind test on enterprise/enterprise-master +fab use.postgres:12.1 use.enterprise:enterprise-master run.valgrind +``` + +However as valgrind tests take too much time to complete, we recommend you to run valgrind tests in a detached session: +```bash +sudo yum install tmux +tmux new -d "fab use.postgres:12.1 use.enterprise:enterprise-master run.valgrind" +``` + +After the tests are finished (takes up to 9 hours with the default coordinator size), re-connect to the coordinator. +The results can be found under the `$HOME/results` directory. + +To push the results to the `release-test-results` repository, run the command below on the coordinator node (the argument is the prefix of the branch that the results are pushed to): + +```bash +sh $HOME/test-automation/azure/push-results.sh 'your-results-branch-prefix-here' +``` + +Finally, delete your resource group. + ## Example fab Commands Use `fab --list` to see all the tasks you can run! This is just a few examples. diff --git a/azure/citus-bot.sh b/azure/citus-bot.sh index 09116889..ae4f473b 100755 --- a/azure/citus-bot.sh +++ b/azure/citus-bot.sh @@ -40,8 +40,19 @@ trap cleanup EXIT rg=$1 export RESOURCE_GROUP_NAME=${rg} -./create-cluster.sh +if [ "$rg" == "citusbot_valgrind_test_resource_group" ]; then + # If running valgrind tests, do not run cleanup function + # This is because, as valgrind tests requires too much time to run, + # we start valgrind tests via nohup in ci.
Hence ssh session + # will immediately be closed just after the fabric command is run + trap - EXIT + # If running valgrind tests, export VALGRIND_TEST to be 1 to ensure + # only coordinator instance is created in create-cluster script + export VALGRIND_TEST=1 +fi + +./create-cluster.sh public_ip=$(az group deployment show -g ${rg} -n azuredeploy --query properties.outputs.publicIP.value) # remove the quotes diff --git a/azure/create-cluster.sh b/azure/create-cluster.sh index 5b0b9302..d9d93721 100755 --- a/azure/create-cluster.sh +++ b/azure/create-cluster.sh @@ -34,7 +34,22 @@ echo "waiting a long time to create cluster, this might take up to 30 mins depen # so that $HOME, $PATH are set to the target users $HOME and $PATH. export BRANCH=${CIRCLE_BRANCH:=master} -az group deployment create -g ${rg} --template-file azuredeploy.json --parameters @azuredeploy.parameters.json --parameters sshPublicKey="${public_key}" branchName="$BRANCH" +# below is the default create cluster command +CREATE_CLUSTER_COMMAND=(az group deployment create -g ${rg} --template-file azuredeploy.json --parameters @azuredeploy.parameters.json --parameters sshPublicKey="${public_key}" branchName="$BRANCH") + +# if VALGRIND_TEST variable is not exported, set it to 0 +is_valgrind_test=${VALGRIND_TEST:=0} + +# if we want to run valgrind tests, lets overwrite numberOfWorkers parameter with 0 +if [[ "$is_valgrind_test" == "1" ]]; then + # be on the safe side, add "--parameters" before "numberOfWorkers" as the order + # of the parameters in CREATE_CLUSTER_COMMAND may change + CREATE_CLUSTER_COMMAND+=(--parameters) + CREATE_CLUSTER_COMMAND+=(numberOfWorkers=0) +fi + +# run CREATE_CLUSTER_COMMAND +"${CREATE_CLUSTER_COMMAND[@]}" end_time=`date +%s` echo execution time was `expr $end_time - $start_time` s. 
diff --git a/azure/finalize-valgrind-test.sh b/azure/finalize-valgrind-test.sh new file mode 100755 index 00000000..cc49369f --- /dev/null +++ b/azure/finalize-valgrind-test.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# fail if trying to reference a variable that is not set. +set -u +# exit immediately if a command fails +set -e +# echo commands +set -x + +function cleanup { + sh ./delete-resource-group.sh +} + +export RESOURCE_GROUP_NAME="citusbot_valgrind_test_resource_group" + +trap cleanup EXIT + +public_ip=$(az group deployment show -g ${RESOURCE_GROUP_NAME} -n azuredeploy --query properties.outputs.publicIP.value) +# remove the quotes +public_ip=$(echo ${public_ip} | cut -d "\"" -f 2) + +echo ${public_ip} + +ssh-keyscan -H ${public_ip} >> ~/.ssh/known_hosts +chmod 600 ~/.ssh/known_hosts + +sh ./delete-security-rule.sh + +echo "adding public ip to known hosts in remote" +ssh -o "StrictHostKeyChecking no" -A pguser@${public_ip} "ssh-keyscan -H ${public_ip} >> /home/pguser/.ssh/known_hosts" +echo "running tests in remote" + +# ssh with non-interactive mode does not source bash profile, so we will need to do it ourselves here. +# put an empty success file for valgrind tests under results dir if there are error logs +# push the files under results dir +ssh -o "StrictHostKeyChecking no" -A pguser@${public_ip} \ +"source ~/.bash_profile;" \ +"sh /home/pguser/test-automation/azure/push-results.sh ${RESOURCE_GROUP_NAME}"; diff --git a/azure/push-results.sh b/azure/push-results.sh new file mode 100755 index 00000000..3f24781f --- /dev/null +++ b/azure/push-results.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# this scripts pushes the results under results/ directory to release-test-results repository + +# args # +# $1 -> branch name to push results + +# fail if trying to reference a variable that is not set. 
+set -u +# exit immediately if a command fails +set -e +# fail in a pipeline if any of the commands fails +set -o pipefail + +branch_name=$1 + +# add github to known hosts + +echo "github.com ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ==" >> ~/.ssh/known_hosts + +git clone git@github.com:citusdata/release-test-results.git "${HOME}"/release-test-results + +git config --global user.email "citus-bot@microsoft.com" +git config --global user.name "citus bot" + +now=$(date +"%m_%d_%Y_%s") + +mv "${HOME}"/results "${HOME}"/release-test-results/periodic_job_results/"${now}" + +cd "${HOME}"/release-test-results + +commit_message="add test results" + +git checkout -b "${branch_name}/${now}" +git add -A +git commit -m "$commit_message" +git push origin "${branch_name}/${now}" diff --git a/azure/run-all-tests.sh b/azure/run-all-tests.sh index 3b8fe371..86fbbb7c 100755 --- a/azure/run-all-tests.sh +++ b/azure/run-all-tests.sh @@ -22,22 +22,23 @@ if [ "$rg_name" = "citusbot_tpch_test_resource_group" ]; then fab run.tpch_automate fi - -# add github to known hosts -echo "github.com ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ==" >> ~/.ssh/known_hosts - -git clone git@github.com:citusdata/release-test-results.git - -git config --global user.email "citus-bot@microsoft.com" -git config --global user.name "citus bot" - 
-now=$(date +"%m_%d_%Y_%s") - -mv ${HOME}/results ${HOME}/release-test-results/periodic_job_results/${now} - -cd ${HOME}/release-test-results - -git checkout -b ${rg_name}/${now} -git add -A -git commit -m "add test results for performance tests ${rg_name}" -git push origin ${rg_name}/${now} +# If running valgrind tests, do not push the results here +# This is because, as valgrind tests require too much time to run, +# we start valgrind tests via nohup in ci. Hence ssh session +# will immediately be closed just after the fabric command is run +# +# We have a separate job to terminate the machine and push the results +if [ "$rg_name" = "citusbot_valgrind_test_resource_group" ]; then + nohup fab use.postgres:13.1 use.enterprise:enterprise-master run.valgrind > /dev/null 2>&1 & + + # wait for cloning to end + while ! test -d "$HOME/citus-enterprise"; + do + echo "Wait until citus is cloned completely ..."; + sleep 60; + done + + echo "Citus is cloned successfully"; +else + sh "${HOME}"/test-automation/azure/push-results.sh "$1"; +fi diff --git a/fabfile/config.py b/fabfile/config.py index 0860da8d..f52a7f3a 100644 --- a/fabfile/config.py +++ b/fabfile/config.py @@ -14,6 +14,19 @@ RESULTS_DIRECTORY = os.path.join(HOME_DIR, 'results') CITUS_INSTALLATION = os.path.join(HOME_DIR, 'citus-installation') PORT = 5432 +RELATIVE_REGRESS_PATH = 'src/test/regress' + +# keys to access settings dictionary +REPO_PATH = 'repo_path' +BUILD_CITUS_FUNC = 'build_citus_func' + +# valgrind test variables +VALGRIND_TEST_OUT_FILE = 'valgrind_test_out.txt' +VALGRIND_LOGS_FILE = 'valgrind_test_log.txt' +REGRESSION_DIFFS_FILE = 'regression.diffs' +CITUS_RELATED_VALGRIND_LOG_FILE = 'valgrind_test_log_citus.txt' +VALGRIND_REQUIRED_PACKAGES = ['valgrind', 'valgrind-devel.x86_64', 'openssl-devel.x86_64', 'libicu-devel.x86_64'] +VALGRIND_SUCCESS_FNAME = 'valgrind_success' PG_VERSION = '9.6.1' PG_CONFIGURE_FLAGS = ['--with-openssl'] diff --git a/fabfile/run.py b/fabfile/run.py index 00d15ee6..add0a03f
100644 --- a/fabfile/run.py +++ b/fabfile/run.py @@ -1,4 +1,5 @@ -from fabric.api import task, run, cd, runs_once, roles, execute +from fabric.api import task, run, cd, runs_once, roles, execute, abort +from fabric.context_managers import settings import config import use @@ -12,7 +13,7 @@ import ConfigParser import time -__all__ = ['jdbc', 'regression', 'pgbench_tests', 'tpch_automate'] +__all__ = ['jdbc', 'regression', 'pgbench_tests', 'tpch_automate', 'valgrind', 'valgrind_filter_put_results'] @task @@ -205,3 +206,74 @@ def tpch_queries(query_info, connectionURI, pg_version, citus_version, config_fi out_val = run(run_string) results_file.write(out_val) results_file.write('\n') + +# If no citus valgrind logs exist results directory, then simply put valgrind_success +# file under results directory. +def valgrind_filter_put_results(): + 'Filter valgrind test outputs, put success file if no citus related valgrind output' + + repo_path = config.settings[config.REPO_PATH] + + regression_test_path = os.path.join(repo_path, config.RELATIVE_REGRESS_PATH) + + regression_diffs_path = os.path.join(regression_test_path, config.REGRESSION_DIFFS_FILE) + valgrind_logs_path = os.path.join(regression_test_path, config.VALGRIND_LOGS_FILE) + + citus_valgrind_logs_path = os.path.join(config.RESULTS_DIRECTORY, config.CITUS_RELATED_VALGRIND_LOG_FILE) + success_file_path = os.path.join(config.RESULTS_DIRECTORY, config.VALGRIND_SUCCESS_FNAME) + + trace_ids_tmp_file = ".trace_ids" + trace_ids_path = os.path.join(regression_test_path, trace_ids_tmp_file) + + # ship regression.diffs (if exists) to result folder + if os.path.isfile(regression_diffs_path): + run('mv {} {}'.format(regression_diffs_path, config.RESULTS_DIRECTORY)) + + # filter the (possibly) citus-related outputs and put to results file if existz + + if os.path.isfile(valgrind_logs_path): + + # get stack trace id that includes calls to citus + run('cat {} | grep -i "citus" | awk \'{{ print $1 }}\' | uniq > 
{}'.format(valgrind_logs_path, trace_ids_path)) + + if os.path.isfile(trace_ids_path) and os.path.getsize(trace_ids_path) > 0: + # filter stack traces with stack trace ids that we found above (if any) + run('while read line; do grep {} -e $line ; done < {} > {}'.format( + valgrind_logs_path, + trace_ids_path, + citus_valgrind_logs_path)) + + # cleanup + run('rm {}'.format(trace_ids_path)) + + # if we have no citus-related valgrind outputs then just put an empty file named as `config.VALGRIND_SUCCESS_FNAME` + if not os.path.exists(citus_valgrind_logs_path): + run('touch {}'.format(success_file_path)) + +@task +@roles('master') +def valgrind(*args): + 'Runs valgrind tests' + + # set citus path variable + repo_path = config.settings[config.REPO_PATH] + + use.valgrind() + setup.valgrind() + + with cd(os.path.join(repo_path, config.RELATIVE_REGRESS_PATH)): + + # make check-multi-vg returns 2 in case of failures in regression tests + # we should do failure handling here + with settings(warn_only=True): + valgrind_logs_path=os.path.join(config.RESULTS_DIRECTORY, config.VALGRIND_LOGS_FILE) + valgrind_test_out_path = os.path.join(config.RESULTS_DIRECTORY, config.VALGRIND_TEST_OUT_FILE) + + # wrap the command with tee to log stdout & stderr to a file in results directory as well + # this is done to ensure that valgrind test is actually finished + valgrind_test_command = 'make check-multi-vg valgrind-log-file={}'.format(valgrind_logs_path) + valgrind_test_command = valgrind_test_command + ' 2>&1 | tee {}'.format(valgrind_test_out_path) + + run(valgrind_test_command) + + valgrind_filter_put_results() diff --git a/fabfile/setup.py b/fabfile/setup.py index 434320e9..824c4601 100644 --- a/fabfile/setup.py +++ b/fabfile/setup.py @@ -22,6 +22,7 @@ import add import use import prefix +import use __all__ = ["basic_testing", "tpch", "valgrind", "enterprise", "hammerdb"] @@ -46,18 +47,21 @@ def tpch(): @task def valgrind(): - 'Just like basic_testing, but adds --enable-debug flag and 
installs valgrind' - execute(prefix.ensure_pg_latest_exists, default=config.CITUS_INSTALLATION) + # prepare yum install command + install_required_packages_command = 'yum install -q -y ' + ' '.join(config.VALGRIND_REQUIRED_PACKAGES) - # we do this execute dance so valgrind is installed on every node, not just the master - def install_valgrind(): - sudo('yum install -q -y valgrind') - execute(install_valgrind) + # install libraries required for valgrind test + sudo(install_required_packages_command) - config.PG_CONFIGURE_FLAGS.append('--enable-debug') + # create results directory to put resulting log files there + # (for pushing them to results repository) + utils.rmdir(config.RESULTS_DIRECTORY, force=True) + utils.mkdir_if_not_exists(config.RESULTS_DIRECTORY) - execute(common_setup, build_citus) - execute(add_workers) + # set build citus function + build_citus_func = config.settings[config.BUILD_CITUS_FUNC] + execute(prefix.ensure_pg_latest_exists, default=config.CITUS_INSTALLATION) + execute(common_setup, build_citus_func) @task @roles('master') diff --git a/fabfile/use.py b/fabfile/use.py index 29d28d8c..6ecc7438 100644 --- a/fabfile/use.py +++ b/fabfile/use.py @@ -5,7 +5,9 @@ ''' import re -from fabric.api import task, runs_once, abort, local, lcd, roles +from fabric.api import task, runs_once, abort, local, lcd, roles, sudo + +import setup import config import utils @@ -21,7 +23,12 @@ def citus(*args): abort('You must provide a single argument, with a command such as "use.citus:v6.0.1"') git_ref = args[0] - path = config.CITUS_REPO + # set community repo specific variables + config.settings[config.REPO_PATH] = config.CITUS_REPO + config.settings[config.BUILD_CITUS_FUNC] = setup.build_citus + + # check if we can clone citus successfully, then remove it + path = "/tmp/tmp_citus" local('rm -rf {} || true'.format(path)) local('git clone -q https://github.com/citusdata/citus.git {}'.format(path)) with lcd(path): @@ -41,7 +48,12 @@ def enterprise(*args): abort('You 
must provide a single argument, with a command such as "use.enterprise:v6.0.1"') git_ref = args[0] - path = config.ENTERPRISE_REPO + # set enterprise repo specific variables + config.settings[config.REPO_PATH] = config.ENTERPRISE_REPO + config.settings[config.BUILD_CITUS_FUNC] = setup.build_enterprise + + # check if we can clone citus successfully, then remove it + path = "/tmp/tmp_citus" local('rm -rf {} || true'.format(path)) if config.settings[config.IS_SSH_KEYS_USED]: local('git clone -q git@github.com:citusdata/citus-enterprise.git {}'.format(path)) @@ -79,3 +91,9 @@ def asserts(*args): def debug_mode(*args): '''ps's configure is passed: '--enable-debug --enable-cassert CFLAGS="-ggdb -Og -g3 -fno-omit-frame-pointer"' ''' config.PG_CONFIGURE_FLAGS.append('--enable-debug --enable-cassert CFLAGS="-ggdb -Og -g3 -fno-omit-frame-pointer"') + + +@task +def valgrind(*args): + config.PG_CONFIGURE_FLAGS.append('--with-icu --enable-cassert --enable-debug CFLAGS="-ggdb -Og -DUSE_VALGRIND"') + \ No newline at end of file From c1786a34ffa5de6561cad010ee64c21d892c38b1 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Fri, 3 Jul 2020 10:26:01 +0300 Subject: [PATCH 2/7] test on current branch (this will be reverted before merge) --- .circleci/config.yml | 4 ++-- azure/create-cluster.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ce2bfdf6..b8401089 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -141,7 +141,7 @@ workflows: filters: branches: only: - - master + - valgrind-merge jobs: - valgrind-test @@ -155,6 +155,6 @@ workflows: filters: branches: only: - - master + - valgrind-merge jobs: - finalize-valgrind-test diff --git a/azure/create-cluster.sh b/azure/create-cluster.sh index d9d93721..ea6d9410 100755 --- a/azure/create-cluster.sh +++ b/azure/create-cluster.sh @@ -32,7 +32,7 @@ echo "waiting a long time to create cluster, this might take up to 30 mins depen # store the branch name in 
a file so that target user can read it. Target user cannot see the envionment variables because # we use login option in su and -p(preserving environment variables) cannot be used with login. We need to use login option # so that $HOME, $PATH are set to the target users $HOME and $PATH. -export BRANCH=${CIRCLE_BRANCH:=master} +export BRANCH="valgrind-merge" # below is the default create cluster command CREATE_CLUSTER_COMMAND=(az group deployment create -g ${rg} --template-file azuredeploy.json --parameters @azuredeploy.parameters.json --parameters sshPublicKey="${public_key}" branchName="$BRANCH") From 5377c9f2476995fa2138f44624c7ac359b66dbe7 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Mon, 1 Feb 2021 13:18:28 +0300 Subject: [PATCH 3/7] Revert "test on current branch (this will be reverted before merge)" This reverts commit c1786a34ffa5de6561cad010ee64c21d892c38b1. --- .circleci/config.yml | 4 ++-- azure/create-cluster.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b8401089..ce2bfdf6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -141,7 +141,7 @@ workflows: filters: branches: only: - - valgrind-merge + - master jobs: - valgrind-test @@ -155,6 +155,6 @@ workflows: filters: branches: only: - - valgrind-merge + - master jobs: - finalize-valgrind-test diff --git a/azure/create-cluster.sh b/azure/create-cluster.sh index ea6d9410..d9d93721 100755 --- a/azure/create-cluster.sh +++ b/azure/create-cluster.sh @@ -32,7 +32,7 @@ echo "waiting a long time to create cluster, this might take up to 30 mins depen # store the branch name in a file so that target user can read it. Target user cannot see the envionment variables because # we use login option in su and -p(preserving environment variables) cannot be used with login. We need to use login option # so that $HOME, $PATH are set to the target users $HOME and $PATH. 
-export BRANCH="valgrind-merge" +export BRANCH=${CIRCLE_BRANCH:=master} # below is the default create cluster command CREATE_CLUSTER_COMMAND=(az group deployment create -g ${rg} --template-file azuredeploy.json --parameters @azuredeploy.parameters.json --parameters sshPublicKey="${public_key}" branchName="$BRANCH") From b8b605e93526998771d2f3ff5c139415cd11d06a Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Mon, 1 Feb 2021 13:21:23 +0300 Subject: [PATCH 4/7] fixup! Add valgrind tests as we didn't merge #203, fix readme section --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 19dfea41..87ac7df4 100644 --- a/README.md +++ b/README.md @@ -611,16 +611,14 @@ TL;DR # 1 # start valgrind test # create valgrind instance to run -eval `ssh-agent -s` -ssh-add -export GIT_USERNAME= -export GIT_TOKEN= # You can create a github token from https://github.com/settings/tokens. export RESOURCE_GROUP_NAME='your-valgrind-test-rg-name-here' export VALGRIND_TEST=1 cd azure ./create-cluster.sh # connect to coordinator +eval `ssh-agent -s` +ssh-add ./connect.sh # run fab command in coordinator in a detachable session From 0c3d4efd41d50cef31f602e742c21c2c31cea6be Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Mon, 1 Feb 2021 13:37:33 +0300 Subject: [PATCH 5/7] fixup! 
Add valgrind tests check if not 0 instead --- azure/create-cluster.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure/create-cluster.sh b/azure/create-cluster.sh index d9d93721..c7b0a27e 100755 --- a/azure/create-cluster.sh +++ b/azure/create-cluster.sh @@ -41,7 +41,7 @@ CREATE_CLUSTER_COMMAND=(az group deployment create -g ${rg} --template-file azur is_valgrind_test=${VALGRIND_TEST:=0} # if we want to run valgrind tests, lets overwrite numberOfWorkers parameter with 0 -if [[ "$is_valgrind_test" == "1" ]]; then +if [[ "$is_valgrind_test" != "0" ]]; then # be on the safe side, add "--parameters" before "numberOfWorkers" as the order # of the parameters in CREATE_CLUSTER_COMMAND may change CREATE_CLUSTER_COMMAND+=(--parameters) From 18b7b1c06c9708460167296cdc3930a525199664 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Mon, 1 Feb 2021 13:46:21 +0300 Subject: [PATCH 6/7] fixup! Add valgrind tests better readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 87ac7df4..7605db92 100644 --- a/README.md +++ b/README.md @@ -685,6 +685,7 @@ sh $HOME/test-automation/azure/push-results.sh ``` Finally, delete your resource group. +Note that the automated (weekly) valgrind test already destroys the resources that it uses. ## Example fab Commands From eb6fff05eef584802476c683dd379adcb30fe1cd Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Mon, 1 Feb 2021 14:50:26 +0300 Subject: [PATCH 7/7] fixup!
Add valgrind tests final reviews --- .circleci/config.yml | 4 ++-- README.md | 2 -- azure/citus-bot.sh | 13 ++++++++----- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ce2bfdf6..c54aa9ba 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -133,10 +133,10 @@ workflows: - /all_performance_test\/.*/ # match with all_performance_test/ prefix # perform weekly valgrind test on azure every monday at 00:00 - # https://crontab.guru/#0_0_*_*_1 weekly-valgrind: triggers: - schedule: + # https://crontab.guru/#0_0_*_*_1 cron: "0 0 * * 1" filters: branches: @@ -147,10 +147,10 @@ workflows: # Since valgrind tests really take a long time to finish, wait for 9.5 hours. # Then push valgrind test results and terminate the machine. - # https://crontab.guru/#30_9_*_*_1 weekly-valgrind-finalize: triggers: - schedule: + # https://crontab.guru/#30_9_*_*_1 cron: "30 9 * * 1" filters: branches: diff --git a/README.md b/README.md index 7605db92..94843227 100644 --- a/README.md +++ b/README.md @@ -622,7 +622,6 @@ ssh-add ./connect.sh # run fab command in coordinator in a detachable session -sudo yum install tmux tmux new -d "fab use.postgres:12.3 use.enterprise:enterprise-master run.valgrind" # simply exit from coordinator after detaching @@ -671,7 +670,6 @@ fab use.postgres:12.1 use.enterprise:enterprise-master run.valgrind However as valgrind tests take too much time to complete, we recommend you to run valgrind tests in a detached session: ```bash -sudo yum install tmux tmux new -d "fab use.postgres:12.1 use.enterprise:enterprise-master run.valgrind" ``` diff --git a/azure/citus-bot.sh b/azure/citus-bot.sh index ae4f473b..7ba4960a 100755 --- a/azure/citus-bot.sh +++ b/azure/citus-bot.sh @@ -42,11 +42,6 @@ rg=$1 export RESOURCE_GROUP_NAME=${rg} if [ "$rg" == "citusbot_valgrind_test_resource_group" ]; then - # If running valgrind tests, do not run cleanup function - # This is because, as valgrind tests requires 
too much time to run, - # we start valgrind tests via nohup in ci. Hence ssh session - # will immediately be closed just after the fabric command is run - trap - EXIT # If running valgrind tests, export VALGRIND_TEST to be 1 to ensure # only coordinator instance is created in create-cluster script export VALGRIND_TEST=1 @@ -54,6 +49,14 @@ fi ./create-cluster.sh +if [ "${VALGRIND_TEST:-0}" == "1" ]; then + # If running valgrind tests, do not run cleanup function + # This is because, as valgrind tests require too much time to run, + # we start valgrind tests via nohup in ci. Hence ssh session + # will immediately be closed just after the fabric command is run + trap - EXIT +fi + public_ip=$(az group deployment show -g ${rg} -n azuredeploy --query properties.outputs.publicIP.value) # remove the quotes public_ip=$(echo ${public_ip} | cut -d "\"" -f 2)