diff --git a/.circleci/config.yml b/.circleci/config.yml
index 52090427..c54aa9ba 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -63,6 +63,40 @@ jobs:
name: delete the given resource group
no_output_timeout: 10m
+ valgrind-test:
+ docker:
+ - image: buildpack-deps:trusty
+ # this job only starts the valgrind tests; finalize-valgrind-test pushes the results later
+ working_directory: /home/circleci/project
+ steps:
+ - azure-cli/install
+ - azure-cli/login-with-service-principal
+ - checkout
+ - run:
+ command: |
+ cd ./azure
+ ./add-sshkey.sh
+ ./citus-bot.sh citusbot_valgrind_test_resource_group
+ name: install dependencies and run valgrind tests
+ no_output_timeout: 10m
+
+ finalize-valgrind-test:
+ docker:
+ - image: buildpack-deps:trusty
+ # pushes the valgrind test results and deletes the resource group used by valgrind-test
+ working_directory: /home/circleci/project
+ steps:
+ - azure-cli/install
+ - azure-cli/login-with-service-principal
+ - checkout
+ - run:
+ command: |
+ cd ./azure
+ ./add-sshkey.sh
+ ./finalize-valgrind-test.sh
+ name: push valgrind test results and delete the resource group
+ no_output_timeout: 10m
+
orbs:
azure-cli: circleci/azure-cli@1.0.0
@@ -97,3 +131,30 @@ workflows:
only:
- /tpch\/.*/ # match with tpch/ prefix
- /all_performance_test\/.*/ # match with all_performance_test/ prefix
+
+ # start the weekly valgrind test on azure every monday at 00:00 (results are pushed by weekly-valgrind-finalize)
+ weekly-valgrind:
+ triggers:
+ - schedule:
+ # https://crontab.guru/#0_0_*_*_1
+ cron: "0 0 * * 1"
+ filters:
+ branches:
+ only:
+ - master
+ jobs:
+ - valgrind-test
+
+ # Valgrind tests take a long time to finish, so this workflow is scheduled 9.5 hours
+ # after weekly-valgrind. It pushes the valgrind test results and deletes the resource group.
+ weekly-valgrind-finalize:
+ triggers:
+ - schedule:
+ # https://crontab.guru/#30_9_*_*_1
+ cron: "30 9 * * 1"
+ filters:
+ branches:
+ only:
+ - master
+ jobs:
+ - finalize-valgrind-test
diff --git a/README.md b/README.md
index cabc2a76..94843227 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@ required for testing citus.
* [Running PgBench Tests Against Hyperscale (Citus)](#pgbench-cloud)
* [Running TPC-H Tests](#tpch)
* [Running TPC-H Tests Against Hyperscale (Citus)](#tpch-cloud)
+ * [Running Valgrind Tests](#valgrind)
* [Example fab Commands](#fab-examples)
* [Tasks, and Ordering of Tasks](#fab-tasks)
* [Task Namespaces](#task-namespaces)
@@ -602,6 +603,88 @@ On the coordinator node:
fab run.tpch_automate:tpch_q1.ini,connectionURI='postgres://citus:dwVg70yBfkZ6hO1WXFyq1Q@c.fhhwxh5watzbizj3folblgbnpbu.db.citusdata.com:5432/citus?sslmode\=require'
```
+## <a name="valgrind"></a> Running Valgrind Tests
+
+TL;DR
+
+```bash
+# 1 # start valgrind test
+
+# create valgrind instance to run
+export RESOURCE_GROUP_NAME='your-valgrind-test-rg-name-here'
+export VALGRIND_TEST=1
+cd azure
+./create-cluster.sh
+
+# connect to coordinator
+eval `ssh-agent -s`
+ssh-add
+./connect.sh
+
+# run fab command in coordinator in a detachable session
+tmux new -d "fab use.postgres:12.3 use.enterprise:enterprise-master run.valgrind"
+
+# simply exit from coordinator after detaching
+
+# 2 # finalize valgrind test
+
+# reconnect to coordinator after 9.5 hours (if you used the default coordinator configuration)
+export RESOURCE_GROUP_NAME='your-valgrind-test-rg-name-here'
+./connect.sh
+
+# you can first check if valgrind test is finished by attaching to tmux session
+tmux a
+# then you should detach from the session before moving forward
+Ctrl+b d
+
+# run push results script
+cd test-automation/azure
+./push-results.sh your-results-branch-name-here
+
+# simply exit from coordinator after pushing the results
+
+# delete resource group finally
+cd azure
+./delete-resource-group.sh
+```
+
+DETAILS:
+
+To create a valgrind instance, follow the steps in [Setup Steps For Each Test](#azure-setup-steps), but run the following before executing `create-cluster.sh`:
+
+```bash
+export VALGRIND_TEST=1
+```
+
+Exporting this variable makes the `numberOfWorkers` setting irrelevant, since `create-cluster.sh` then overrides it with 0.
+This is because we already use our regression test structure, which creates a local cluster
+itself. Also, we install `valgrind` only on the coordinator; if there were worker nodes, building
+PostgreSQL would fail because `valgrind` would be required on the workers too, even though we do not need them.
+
+On the coordinator node:
+
+```bash
+# an example usage: Use PostgreSQL 12.1 and run valgrind test on enterprise/enterprise-master
+fab use.postgres:12.1 use.enterprise:enterprise-master run.valgrind
+```
+
+However, as valgrind tests take a long time to complete, we recommend running them in a detached session:
+```bash
+tmux new -d "fab use.postgres:12.1 use.enterprise:enterprise-master run.valgrind"
+```
+
+After the tests are finished (this takes up to 9 hours with the default coordinator size), re-connect to the coordinator.
+Results can be found under the `$HOME/results` directory.
+
+To push the results to the `release-test-results` repository, run the below command on the coordinator node:
+
+```bash
+sh $HOME/test-automation/azure/push-results.sh your-results-branch-name-here
+```
+
+Finally, delete your resource group.
+Note that automated (weekly) valgrind test already destroys the resources that it uses.
+
## Example fab Commands
Use `fab --list` to see all the tasks you can run! This is just a few examples.
diff --git a/azure/citus-bot.sh b/azure/citus-bot.sh
index 09116889..7ba4960a 100755
--- a/azure/citus-bot.sh
+++ b/azure/citus-bot.sh
@@ -40,8 +40,22 @@ trap cleanup EXIT
rg=$1
export RESOURCE_GROUP_NAME=${rg}
+
+if [ "$rg" == "citusbot_valgrind_test_resource_group" ]; then
+ # If running valgrind tests, export VALGRIND_TEST to be 1 to ensure
+ # only coordinator instance is created in create-cluster script
+ export VALGRIND_TEST=1
+fi
+
./create-cluster.sh
+if [ "$VALGRIND_TEST" == "1" ]; then
+ # If running valgrind tests, do not run cleanup function
+ # This is because, as valgrind tests requires too much time to run,
+ # we start valgrind tests via nohup in ci. Hence ssh session
+ # will immediately be closed just after the fabric command is run
+ trap - EXIT
+fi
public_ip=$(az group deployment show -g ${rg} -n azuredeploy --query properties.outputs.publicIP.value)
# remove the quotes
diff --git a/azure/create-cluster.sh b/azure/create-cluster.sh
index 5b0b9302..c7b0a27e 100755
--- a/azure/create-cluster.sh
+++ b/azure/create-cluster.sh
@@ -34,7 +34,22 @@ echo "waiting a long time to create cluster, this might take up to 30 mins depen
# so that $HOME, $PATH are set to the target users $HOME and $PATH.
export BRANCH=${CIRCLE_BRANCH:=master}
-az group deployment create -g ${rg} --template-file azuredeploy.json --parameters @azuredeploy.parameters.json --parameters sshPublicKey="${public_key}" branchName="$BRANCH"
+# below is the default create cluster command
+CREATE_CLUSTER_COMMAND=(az group deployment create -g ${rg} --template-file azuredeploy.json --parameters @azuredeploy.parameters.json --parameters sshPublicKey="${public_key}" branchName="$BRANCH")
+
+# if VALGRIND_TEST variable is not exported, set it to 0
+is_valgrind_test=${VALGRIND_TEST:=0}
+
+# if we want to run valgrind tests, lets overwrite numberOfWorkers parameter with 0
+if [[ "$is_valgrind_test" != "0" ]]; then
+ # be on the safe side, add "--parameters" before "numberOfWorkers" as the order
+ # of the parameters in CREATE_CLUSTER_COMMAND may change
+ CREATE_CLUSTER_COMMAND+=(--parameters)
+ CREATE_CLUSTER_COMMAND+=(numberOfWorkers=0)
+fi
+
+# run CREATE_CLUSTER_COMMAND
+"${CREATE_CLUSTER_COMMAND[@]}"
end_time=`date +%s`
echo execution time was `expr $end_time - $start_time` s.
diff --git a/azure/finalize-valgrind-test.sh b/azure/finalize-valgrind-test.sh
new file mode 100755
index 00000000..cc49369f
--- /dev/null
+++ b/azure/finalize-valgrind-test.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# fail if trying to reference a variable that is not set.
+set -u
+# exit immediately if a command fails
+set -e
+# echo commands
+set -x
+
+# delete the resource group whenever this script exits, even if pushing the results fails
+function cleanup {
+    sh ./delete-resource-group.sh
+}
+
+export RESOURCE_GROUP_NAME="citusbot_valgrind_test_resource_group"
+
+trap cleanup EXIT
+
+public_ip=$(az group deployment show -g "${RESOURCE_GROUP_NAME}" -n azuredeploy --query properties.outputs.publicIP.value)
+# remove the quotes
+public_ip=$(echo "${public_ip}" | cut -d "\"" -f 2)
+
+echo "${public_ip}"
+
+ssh-keyscan -H "${public_ip}" >> ~/.ssh/known_hosts
+chmod 600 ~/.ssh/known_hosts
+
+sh ./delete-security-rule.sh
+
+echo "adding public ip to known hosts in remote"
+ssh -o "StrictHostKeyChecking no" -A pguser@"${public_ip}" "ssh-keyscan -H ${public_ip} >> /home/pguser/.ssh/known_hosts"
+echo "pushing valgrind test results from remote"
+
+# ssh with non-interactive mode does not source bash profile, so we will need to do it ourselves here.
+# push the files under results dir; the resource group name is passed as the branch name
+ssh -o "StrictHostKeyChecking no" -A pguser@"${public_ip}" \
+"source ~/.bash_profile;" \
+"sh /home/pguser/test-automation/azure/push-results.sh ${RESOURCE_GROUP_NAME}";
diff --git a/azure/push-results.sh b/azure/push-results.sh
new file mode 100755
index 00000000..3f24781f
--- /dev/null
+++ b/azure/push-results.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# this script pushes the results under results/ directory to release-test-results repository
+
+# args #
+# $1 -> branch name to push results (results are pushed to the branch "<$1>/<timestamp>")
+
+# fail if trying to reference a variable that is not set.
+set -u
+# exit immediately if a command fails
+set -e
+# fail in a pipeline if any of the commands fails
+set -o pipefail
+# fail with a usage message (instead of a bare "unbound variable" error) if $1 is missing or empty
+branch_name=${1:?"usage: push-results.sh <branch name to push results>"}
+
+# add github to known hosts
+
+echo "github.com ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ==" >> ~/.ssh/known_hosts
+
+git clone git@github.com:citusdata/release-test-results.git "${HOME}"/release-test-results
+# commits are authored as the citus bot
+git config --global user.email "citus-bot@microsoft.com"
+git config --global user.name "citus bot"
+# timestamp used for both the results directory name and the branch name
+now=$(date +"%m_%d_%Y_%s")
+# move the results into the cloned repository under a timestamped directory
+mv "${HOME}"/results "${HOME}"/release-test-results/periodic_job_results/"${now}"
+
+cd "${HOME}"/release-test-results
+
+commit_message="add test results"
+# commit and push the results on a new branch named "<branch_name>/<timestamp>"
+git checkout -b "${branch_name}/${now}"
+git add -A
+git commit -m "$commit_message"
+git push origin "${branch_name}/${now}"
diff --git a/azure/run-all-tests.sh b/azure/run-all-tests.sh
index 3b8fe371..86fbbb7c 100755
--- a/azure/run-all-tests.sh
+++ b/azure/run-all-tests.sh
@@ -22,22 +22,23 @@ if [ "$rg_name" = "citusbot_tpch_test_resource_group" ]; then
fab run.tpch_automate
fi
-
-# add github to known hosts
-echo "github.com ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ==" >> ~/.ssh/known_hosts
-
-git clone git@github.com:citusdata/release-test-results.git
-
-git config --global user.email "citus-bot@microsoft.com"
-git config --global user.name "citus bot"
-
-now=$(date +"%m_%d_%Y_%s")
-
-mv ${HOME}/results ${HOME}/release-test-results/periodic_job_results/${now}
-
-cd ${HOME}/release-test-results
-
-git checkout -b ${rg_name}/${now}
-git add -A
-git commit -m "add test results for performance tests ${rg_name}"
-git push origin ${rg_name}/${now}
+# If running valgrind tests, only start them here and do not push the results.
+# This is because, as valgrind tests require too much time to run,
+# we start valgrind tests via nohup in ci. Hence the ssh session
+# will be closed just after citus is cloned, while the tests keep running.
+#
+# We have a separate job to terminate the machine and push the results
+if [ "$rg_name" = "citusbot_valgrind_test_resource_group" ]; then
+    nohup fab use.postgres:13.1 use.enterprise:enterprise-master run.valgrind > /dev/null 2>&1 &
+    fab_pid=$!
+    # wait for cloning to end; fail instead of waiting forever if fab exits before that
+    while ! test -d "$HOME/citus-enterprise";
+    do
+        if ! kill -0 "$fab_pid" 2> /dev/null; then echo "fab exited before citus is cloned"; exit 1; fi
+        echo "Wait until citus is cloned completely ...";
+        sleep 60;
+    done
+
+    echo "Citus is cloned successfully";
+else
+    sh "${HOME}"/test-automation/azure/push-results.sh "$1";
+fi
diff --git a/fabfile/config.py b/fabfile/config.py
index 0860da8d..f52a7f3a 100644
--- a/fabfile/config.py
+++ b/fabfile/config.py
@@ -14,6 +14,19 @@
RESULTS_DIRECTORY = os.path.join(HOME_DIR, 'results')
CITUS_INSTALLATION = os.path.join(HOME_DIR, 'citus-installation')
PORT = 5432
+RELATIVE_REGRESS_PATH = 'src/test/regress'  # regression test directory, relative to the repository path
+
+# keys to access settings dictionary (values are set by use.citus / use.enterprise)
+REPO_PATH = 'repo_path'
+BUILD_CITUS_FUNC = 'build_citus_func'
+
+# valgrind test variables
+VALGRIND_TEST_OUT_FILE = 'valgrind_test_out.txt'  # stdout & stderr of the valgrind test run
+VALGRIND_LOGS_FILE = 'valgrind_test_log.txt'  # log file passed to make as valgrind-log-file
+REGRESSION_DIFFS_FILE = 'regression.diffs'  # moved to the results directory if it exists
+CITUS_RELATED_VALGRIND_LOG_FILE = 'valgrind_test_log_citus.txt'  # valgrind logs filtered to citus-related traces
+VALGRIND_REQUIRED_PACKAGES = ['valgrind', 'valgrind-devel.x86_64', 'openssl-devel.x86_64', 'libicu-devel.x86_64']  # installed via yum
+VALGRIND_SUCCESS_FNAME = 'valgrind_success'  # empty marker file put when there are no citus-related valgrind logs
PG_VERSION = '9.6.1'
PG_CONFIGURE_FLAGS = ['--with-openssl']
diff --git a/fabfile/run.py b/fabfile/run.py
index 00d15ee6..add0a03f 100644
--- a/fabfile/run.py
+++ b/fabfile/run.py
@@ -1,4 +1,5 @@
-from fabric.api import task, run, cd, runs_once, roles, execute
+from fabric.api import task, run, cd, runs_once, roles, execute, abort
+from fabric.context_managers import settings
import config
import use
@@ -12,7 +13,7 @@
import ConfigParser
import time
-__all__ = ['jdbc', 'regression', 'pgbench_tests', 'tpch_automate']
+__all__ = ['jdbc', 'regression', 'pgbench_tests', 'tpch_automate', 'valgrind', 'valgrind_filter_put_results']
@task
@@ -205,3 +206,74 @@ def tpch_queries(query_info, connectionURI, pg_version, citus_version, config_fi
out_val = run(run_string)
results_file.write(out_val)
results_file.write('\n')
+
+# If no citus-related valgrind logs exist in the results directory, then simply put a
+# valgrind_success file under the results directory.
+def valgrind_filter_put_results():
+    'Filter valgrind test outputs, put success file if no citus related valgrind output'
+
+    # NOTE(review): the os.path checks below inspect the machine running fab, while run()
+    # executes on the target host -- this assumes both are the coordinator; confirm.
+    repo_path = config.settings[config.REPO_PATH]
+    regression_test_path = os.path.join(repo_path, config.RELATIVE_REGRESS_PATH)
+    regression_diffs_path = os.path.join(regression_test_path, config.REGRESSION_DIFFS_FILE)
+
+    citus_valgrind_logs_path = os.path.join(config.RESULTS_DIRECTORY, config.CITUS_RELATED_VALGRIND_LOG_FILE)
+    success_file_path = os.path.join(config.RESULTS_DIRECTORY, config.VALGRIND_SUCCESS_FNAME)
+    trace_ids_path = os.path.join(regression_test_path, ".trace_ids")
+
+    # run.valgrind passes a log path under the results directory to make, so look there
+    # first; fall back to the regression test directory for logs written next to the tests
+    valgrind_logs_path = os.path.join(config.RESULTS_DIRECTORY, config.VALGRIND_LOGS_FILE)
+    if not os.path.isfile(valgrind_logs_path):
+        valgrind_logs_path = os.path.join(regression_test_path, config.VALGRIND_LOGS_FILE)
+
+    # ship regression.diffs (if exists) to result folder
+    if os.path.isfile(regression_diffs_path):
+        run('mv {} {}'.format(regression_diffs_path, config.RESULTS_DIRECTORY))
+
+    # filter the (possibly) citus-related outputs and put to results file if they exist
+    if os.path.isfile(valgrind_logs_path):
+        # get stack trace ids that include calls to citus; sort -u (instead of uniq) also drops
+        # non-adjacent duplicates so that no stack trace is written twice below
+        run('cat {} | grep -i "citus" | awk \'{{ print $1 }}\' | sort -u > {}'.format(valgrind_logs_path, trace_ids_path))
+
+        if os.path.isfile(trace_ids_path) and os.path.getsize(trace_ids_path) > 0:
+            # filter stack traces with stack trace ids that we found above (if any)
+            run('while read line; do grep {} -e $line ; done < {} > {}'.format(
+                valgrind_logs_path, trace_ids_path, citus_valgrind_logs_path))
+
+        # cleanup
+        run('rm {}'.format(trace_ids_path))
+
+    # if we have no citus-related valgrind outputs then just put an empty file named as `config.VALGRIND_SUCCESS_FNAME`
+    if not os.path.exists(citus_valgrind_logs_path):
+        run('touch {}'.format(success_file_path))
+
+@task
+@roles('master')
+def valgrind(*args):
+ 'Runs valgrind tests'
+
+ # path of the repository selected by use.citus / use.enterprise
+ repo_path = config.settings[config.REPO_PATH]
+
+ use.valgrind()
+ setup.valgrind()
+
+ with cd(os.path.join(repo_path, config.RELATIVE_REGRESS_PATH)):
+
+ # make check-multi-vg returns 2 in case of failures in regression tests,
+ # so only warn on a non-zero exit code instead of aborting the task
+ with settings(warn_only=True):
+ valgrind_logs_path=os.path.join(config.RESULTS_DIRECTORY, config.VALGRIND_LOGS_FILE)
+ valgrind_test_out_path = os.path.join(config.RESULTS_DIRECTORY, config.VALGRIND_TEST_OUT_FILE)
+
+ # wrap the command with tee to log stdout & stderr to a file in results directory as well
+ # so that one can verify from the results that the valgrind test actually finished
+ valgrind_test_command = 'make check-multi-vg valgrind-log-file={}'.format(valgrind_logs_path)
+ valgrind_test_command = valgrind_test_command + ' 2>&1 | tee {}'.format(valgrind_test_out_path)
+
+ run(valgrind_test_command)
+
+ valgrind_filter_put_results()
diff --git a/fabfile/setup.py b/fabfile/setup.py
index 434320e9..824c4601 100644
--- a/fabfile/setup.py
+++ b/fabfile/setup.py
@@ -22,6 +22,7 @@
import add
import use
import prefix
+import use
__all__ = ["basic_testing", "tpch", "valgrind", "enterprise", "hammerdb"]
@@ -46,18 +47,21 @@ def tpch():
@task
def valgrind():
- 'Just like basic_testing, but adds --enable-debug flag and installs valgrind'
- execute(prefix.ensure_pg_latest_exists, default=config.CITUS_INSTALLATION)
+ # prepare yum install command
+ install_required_packages_command = 'yum install -q -y ' + ' '.join(config.VALGRIND_REQUIRED_PACKAGES)
- # we do this execute dance so valgrind is installed on every node, not just the master
- def install_valgrind():
- sudo('yum install -q -y valgrind')
- execute(install_valgrind)
+ # install libraries required for valgrind test
+ sudo(install_required_packages_command)
- config.PG_CONFIGURE_FLAGS.append('--enable-debug')
+ # create results directory to put resulting log files there
+ # (for pushing them to results repository)
+ utils.rmdir(config.RESULTS_DIRECTORY, force=True)
+ utils.mkdir_if_not_exists(config.RESULTS_DIRECTORY)
- execute(common_setup, build_citus)
- execute(add_workers)
+ # set build citus function
+ build_citus_func = config.settings[config.BUILD_CITUS_FUNC]
+ execute(prefix.ensure_pg_latest_exists, default=config.CITUS_INSTALLATION)
+ execute(common_setup, build_citus_func)
@task
@roles('master')
diff --git a/fabfile/use.py b/fabfile/use.py
index 29d28d8c..6ecc7438 100644
--- a/fabfile/use.py
+++ b/fabfile/use.py
@@ -5,7 +5,9 @@
'''
import re
-from fabric.api import task, runs_once, abort, local, lcd, roles
+from fabric.api import task, runs_once, abort, local, lcd, roles, sudo
+
+import setup
import config
import utils
@@ -21,7 +23,12 @@ def citus(*args):
abort('You must provide a single argument, with a command such as "use.citus:v6.0.1"')
git_ref = args[0]
- path = config.CITUS_REPO
+ # set community repo specific variables
+ config.settings[config.REPO_PATH] = config.CITUS_REPO
+ config.settings[config.BUILD_CITUS_FUNC] = setup.build_citus
+
+ # check if we can clone citus successfully, then remove it
+ path = "/tmp/tmp_citus"
local('rm -rf {} || true'.format(path))
local('git clone -q https://github.com/citusdata/citus.git {}'.format(path))
with lcd(path):
@@ -41,7 +48,12 @@ def enterprise(*args):
abort('You must provide a single argument, with a command such as "use.enterprise:v6.0.1"')
git_ref = args[0]
- path = config.ENTERPRISE_REPO
+ # set enterprise repo specific variables
+ config.settings[config.REPO_PATH] = config.ENTERPRISE_REPO
+ config.settings[config.BUILD_CITUS_FUNC] = setup.build_enterprise
+
+ # check if we can clone citus successfully, then remove it
+ path = "/tmp/tmp_citus"
local('rm -rf {} || true'.format(path))
if config.settings[config.IS_SSH_KEYS_USED]:
local('git clone -q git@github.com:citusdata/citus-enterprise.git {}'.format(path))
@@ -79,3 +91,9 @@ def asserts(*args):
def debug_mode(*args):
'''ps's configure is passed: '--enable-debug --enable-cassert CFLAGS="-ggdb -Og -g3 -fno-omit-frame-pointer"' '''
config.PG_CONFIGURE_FLAGS.append('--enable-debug --enable-cassert CFLAGS="-ggdb -Og -g3 -fno-omit-frame-pointer"')
+
+
+@task
+def valgrind(*args):
+ '''pg's configure is passed: '--with-icu --enable-cassert --enable-debug CFLAGS="-ggdb -Og -DUSE_VALGRIND"' '''
+ config.PG_CONFIGURE_FLAGS.append('--with-icu --enable-cassert --enable-debug CFLAGS="-ggdb -Og -DUSE_VALGRIND"')
\ No newline at end of file