Add valgrind tests (#188)

citusdata · Feb 1, 2021 · 83dbf70 · 83dbf70
1 parent d2397e6
commit 83dbf70
Show file tree

Hide file tree

Showing 11 changed files with 390 additions and 34 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -63,6 +63,40 @@ jobs:
           name: delete the given resource group
           no_output_timeout: 10m             
 
+  valgrind-test:
+    docker:
+      - image: buildpack-deps:trusty
+
+    working_directory: /home/circleci/project  
+    steps:
+      - azure-cli/install
+      - azure-cli/login-with-service-principal
+      - checkout
+      - run:
+          command: |
+            cd ./azure
+            ./add-sshkey.sh
+            ./citus-bot.sh citusbot_valgrind_test_resource_group
+          name: install dependencies and run valgrind tests
+          no_output_timeout: 10m 
+
+  finalize-valgrind-test:
+    docker:
+      - image: buildpack-deps:trusty
+
+    working_directory: /home/circleci/project  
+    steps:
+      - azure-cli/install
+      - azure-cli/login-with-service-principal
+      - checkout
+      - run:
+          command: |
+            cd ./azure
+            ./add-sshkey.sh
+            ./finalize-valgrind-test.sh
+          name: install dependencies and run valgrind tests
+          no_output_timeout: 10m
+
 orbs:
   azure-cli: circleci/azure-cli@1.0.0      
 
@@ -97,3 +131,30 @@ workflows:
               only: 
                 - /tpch\/.*/ # match with tpch/ prefix
                 - /all_performance_test\/.*/ # match with all_performance_test/ prefix
+
+  # perform weekly valgrind test on azure every monday at 00:00
+  weekly-valgrind:
+    triggers:
+      - schedule:
+          # https://crontab.guru/#0_0_*_*_1
+          cron: "0 0 * * 1"
+          filters:
+            branches:
+              only:
+                - master
+    jobs:
+      - valgrind-test
+
+  # Since valgrind tests really take a long time to finish, wait for 9.5 hours.
+  # Then push valgrind test results and terminate the machine.
+  weekly-valgrind-finalize:
+    triggers:
+      - schedule:
+          # https://crontab.guru/#30_9_*_*_1
+          cron: "30 9 * * 1"
+          filters:
+            branches:
+              only:
+                - master
+    jobs:
+      - finalize-valgrind-test
diff --git a/README.md b/README.md
@@ -27,6 +27,7 @@ required for testing citus.
   * [Running PgBench Tests Against Hyperscale (Citus)](#pgbench-cloud)
   * [Running TPC-H Tests](#tpch)
   * [Running TPC-H Tests Against Hyperscale (Citus)](#tpch-cloud)
+  * [Running Valgrind Tests](#valgrind)
 * [Example fab Commands](#fab-examples)
 * [Tasks, and Ordering of Tasks](#fab-tasks)
 * [Task Namespaces](#task-namespaces)
@@ -602,6 +603,88 @@ On the coordinator node:
 fab run.tpch_automate:tpch_q1.ini,connectionURI='postgres://citus:dwVg70yBfkZ6hO1WXFyq1Q@c.fhhwxh5watzbizj3folblgbnpbu.db.citusdata.com:5432/citus?sslmode\=require'
 ```
 
+## <a name="valgrind"></a> Running Valgrind Tests
+
+TL;DR
+
+```bash
+# 1 # start valgrind test
+
+# create valgrind instance to run
+export RESOURCE_GROUP_NAME='your-valgrind-test-rg-name-here'
+export VALGRIND_TEST=1
+cd azure
+./create-cluster.sh
+
+# connect to coordinator
+eval `ssh-agent -s`
+ssh-add
+./connect.sh
+
+# run fab command in coordinator in a detachable session
+tmux new -d "fab use.postgres:12.3 use.enterprise:enterprise-master run.valgrind"
+
+# simply exit from coordinator after detaching
+
+# 2 # finalize valgrind test
+
+# reconnect to coordinator after 9.5 hours (if you preferred default coordinator configuration)
+export RESOURCE_GROUP_NAME='your-valgrind-test-rg-name-here'
+./connect.sh
+
+# you can first check if valgrind test is finished by attaching to tmux session
+tmux a
+# then you should detach from the session before moving forward
+Ctrl+b d
+
+# run push results script
+cd test-automation/azure
+./push-results.sh <branch name you prefer to push results>
+
+# simply exit from coordinator after pushing the results
+
+# delete resource group finally
+cd azure
+./delete-resource-group.sh
+```
+
+DETAILS:
+
+To create a valgrind instance, following the steps in [Setup Steps For Each Test](#azure-setup-steps), do the following before executing `create-cluster.sh`:
+
+```bash
+export VALGRIND_TEST=1
+```
+
+, which makes `numberOfWorkers` setting useless.
+This is because we will already be using our regression test structure and it creates a local cluster 
+itself. Also, as we install `valgrind` only on coordinator, if we have worker nodes, then we cannot build
+PostgreSQL as we require `valgrind` on workers and get error even if we do not need them.
+
+On the coordinator node:
+
+```bash
+# an example usage: Use PostgreSQL 12.1 and run valgrind test on enterprise/enterprise-master
+fab use.postgres:12.1 use.enterprise:enterprise-master run.valgrind
+```
+
+However as valgrind tests take too much time to complete, we recommend you to run valgrind tests in a detached session:
+```bash
+tmux new -d "fab use.postgres:12.1 use.enterprise:enterprise-master run.valgrind"
+```
+
+After the tests are finished (takes up to 9 hours with default coordinator size), re-connect to the coordinator.
+Result can be found under `$HOME/results` directory.
+
+To push the results to `release_test_results` repository, run the below command in coordinator node:
+
+```bash
+sh $HOME/test-automation/azure/push-results.sh <branch_name_to_push>
+```
+
+Finally, delete your resource group.
+Note that automated (weekly) valgrind test already destroys the resources that it uses.
+
 ## <a name="fab-examples"></a> Example fab Commands
 
 Use `fab --list` to see all the tasks you can run! This is just a few examples.

diff --git a/azure/citus-bot.sh b/azure/citus-bot.sh
@@ -40,8 +40,22 @@ trap cleanup EXIT
 
 rg=$1
 export RESOURCE_GROUP_NAME=${rg}
+
+if [ "$rg" == "citusbot_valgrind_test_resource_group" ]; then
+    # If running valgrind tests, export VALGRIND_TEST to be 1 to ensure
+    # only coordinator instance is created in create-cluster script
+    export VALGRIND_TEST=1
+fi
+
 ./create-cluster.sh
 
+if [ "$VALGRIND_TEST" == "1" ]; then
+    # If running valgrind tests, do not run cleanup function
+    # This is because, as valgrind tests requires too much time to run,
+    # we start valgrind tests via nohup in ci. Hence ssh session
+    # will immediately be closed just after the fabric command is run
+    trap - EXIT
+fi
 
 public_ip=$(az group deployment show -g ${rg} -n azuredeploy --query properties.outputs.publicIP.value)
 # remove the quotes 

diff --git a/azure/create-cluster.sh b/azure/create-cluster.sh
@@ -34,7 +34,22 @@ echo "waiting a long time to create cluster, this might take up to 30 mins depen
 # so that $HOME, $PATH are set to the target users $HOME and $PATH.
 export BRANCH=${CIRCLE_BRANCH:=master}
 
-az group deployment create -g ${rg} --template-file azuredeploy.json --parameters @azuredeploy.parameters.json --parameters sshPublicKey="${public_key}" branchName="$BRANCH" 
+# below is the default create cluster command
+CREATE_CLUSTER_COMMAND=(az group deployment create -g ${rg} --template-file azuredeploy.json --parameters @azuredeploy.parameters.json --parameters sshPublicKey="${public_key}" branchName="$BRANCH")
+
+# if VALGRIND_TEST variable is not exported, set it to 0
+is_valgrind_test=${VALGRIND_TEST:=0}
+
+# if we want to run valgrind tests, lets overwrite numberOfWorkers parameter with 0
+if [[ "$is_valgrind_test" != "0" ]]; then
+    # be on the safe side, add "--parameters" before "numberOfWorkers" as the order
+    # of the parameters in CREATE_CLUSTER_COMMAND may change
+    CREATE_CLUSTER_COMMAND+=(--parameters)
+    CREATE_CLUSTER_COMMAND+=(numberOfWorkers=0)
+fi
+
+# run CREATE_CLUSTER_COMMAND
+"${CREATE_CLUSTER_COMMAND[@]}"
 
 end_time=`date +%s`
 echo execution time was `expr $end_time - $start_time` s.

diff --git a/azure/finalize-valgrind-test.sh b/azure/finalize-valgrind-test.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# fail if trying to reference a variable that is not set.
+set -u
+# exit immediately if a command fails
+set -e
+# echo commands
+set -x
+
+function cleanup {
+    sh ./delete-resource-group.sh
+}
+
+export RESOURCE_GROUP_NAME="citusbot_valgrind_test_resource_group"
+
+trap cleanup EXIT
+
+public_ip=$(az group deployment show -g ${RESOURCE_GROUP_NAME} -n azuredeploy --query properties.outputs.publicIP.value)
+# remove the quotes 
+public_ip=$(echo ${public_ip} | cut -d "\"" -f 2)
+
+echo ${public_ip}
+
+ssh-keyscan -H ${public_ip} >> ~/.ssh/known_hosts
+chmod 600 ~/.ssh/known_hosts
+
+sh ./delete-security-rule.sh
+
+echo "adding public ip to known hosts in remote"
+ssh -o "StrictHostKeyChecking no" -A pguser@${public_ip} "ssh-keyscan -H ${public_ip} >> /home/pguser/.ssh/known_hosts"
+echo "running tests in remote"
+
+# ssh with non-interactive mode does not source bash profile, so we will need to do it ourselves here.
+# put an empty success file for valgrind tests under results dir if there are error logs
+# push the files under results dir
+ssh -o "StrictHostKeyChecking no" -A pguser@${public_ip} \
+"source ~/.bash_profile;" \
+"sh /home/pguser/test-automation/azure/push-results.sh ${RESOURCE_GROUP_NAME}";
diff --git a/azure/push-results.sh b/azure/push-results.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# this scripts pushes the results under results/ directory to release-test-results repository
+
+# args #
+# $1 -> branch name to push results
+
+# fail if trying to reference a variable that is not set.
+set -u
+# exit immediately if a command fails
+set -e
+# fail in a pipeline if any of the commands fails
+set -o pipefail
+
+branch_name=$1
+
+# add github to known hosts
+
+echo "github.com ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ==" >> ~/.ssh/known_hosts
+
+git clone git@github.com:citusdata/release-test-results.git "${HOME}"/release-test-results
+
+git config --global user.email "citus-bot@microsoft.com" 
+git config --global user.name "citus bot" 
+
+now=$(date +"%m_%d_%Y_%s")
+
+mv "${HOME}"/results "${HOME}"/release-test-results/periodic_job_results/"${now}"
+
+cd "${HOME}"/release-test-results
+
+commit_message="add test results"
+
+git checkout -b "${branch_name}/${now}"
+git add -A 
+git commit -m "$commit_message"
+git push origin "${branch_name}/${now}"
diff --git a/azure/run-all-tests.sh b/azure/run-all-tests.sh
@@ -22,22 +22,23 @@ if [ "$rg_name" = "citusbot_tpch_test_resource_group" ]; then
     fab run.tpch_automate
 fi 
 
-
-# add github to known hosts
-echo "github.com ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ==" >> ~/.ssh/known_hosts
-
-git clone git@github.com:citusdata/release-test-results.git
-
-git config --global user.email "citus-bot@microsoft.com" 
-git config --global user.name "citus bot" 
-
-now=$(date +"%m_%d_%Y_%s")
-
-mv ${HOME}/results ${HOME}/release-test-results/periodic_job_results/${now}
-
-cd ${HOME}/release-test-results
-
-git checkout -b ${rg_name}/${now}
-git add -A 
-git commit -m "add test results for performance tests ${rg_name}"
-git push origin ${rg_name}/${now}
+# If running valgrind tests, do not run cleanup function
+# This is because, as valgrind tests requires too much time to run,
+# we start valgrind tests via nohup in ci. Hence ssh session 
+# will immediately be closed just after the fabric command is run
+#
+# We have a seperate job to terminate the machine and push the results
+if [ "$rg_name" = "citusbot_valgrind_test_resource_group" ]; then
+    nohup fab use.postgres:13.1 use.enterprise:enterprise-master run.valgrind > /dev/null 2>&1 &
+
+    # wait for cloning to end
+    while ! test -d "$HOME/citus-enterprise";
+    do
+        echo "Wait until citus is cloned completely ...";
+        sleep 60;
+    done
+
+    echo "Citus is cloned succesfully";
+else
+    sh "${HOME}"/test-automation/azure/push-results.sh "$1";
+fi 
diff --git a/fabfile/config.py b/fabfile/config.py
@@ -14,6 +14,19 @@
 RESULTS_DIRECTORY = os.path.join(HOME_DIR, 'results')
 CITUS_INSTALLATION = os.path.join(HOME_DIR, 'citus-installation')
 PORT = 5432
+RELATIVE_REGRESS_PATH = 'src/test/regress'
+
+# keys to access settings dictionary
+REPO_PATH = 'repo_path'
+BUILD_CITUS_FUNC = 'build_citus_func'
+
+# valgrind test variables
+VALGRIND_TEST_OUT_FILE = 'valgrind_test_out.txt'
+VALGRIND_LOGS_FILE = 'valgrind_test_log.txt'
+REGRESSION_DIFFS_FILE = 'regression.diffs'
+CITUS_RELATED_VALGRIND_LOG_FILE = 'valgrind_test_log_citus.txt'
+VALGRIND_REQUIRED_PACKAGES = ['valgrind', 'valgrind-devel.x86_64', 'openssl-devel.x86_64', 'libicu-devel.x86_64']
+VALGRIND_SUCCESS_FNAME = 'valgrind_success'
 
 PG_VERSION = '9.6.1'
 PG_CONFIGURE_FLAGS = ['--with-openssl']