Index: buildbot/google/README.md
===================================================================
--- buildbot/google/README.md
+++ buildbot/google/README.md
@@ -1,3 +1,5 @@
 # Configuration files for various buildbots run by Google.
 * mlir
+* cuda
+
Index: buildbot/google/cuda/README.md
===================================================================
--- /dev/null
+++ buildbot/google/cuda/README.md
@@ -0,0 +1,66 @@
+# CUDA buildbot workers configuration
+
+This folder contains some of the configuration of the buildbots managed
+at Google. The workers are deployed on Google Cloud.
+
+# The cloud stack
+
+To deploy buildbot workers, we need to create a number of virtual machines
+on Google Cloud. There are multiple ways to do this. *Terraform* is convenient
+as it lets us declare the required machines in config files and then
+create/update the machines in the cloud.
+
+This way we have version control over the infrastructure
+and we can review changes before applying them. In case something goes wrong,
+we can easily revert changes. It also allows us to copy & paste parts of the
+infrastructure for additional machines.
+
+Terraform sets up a *Kubernetes* cluster, which manages the deployment of
+software to the machines. The software installed on the build machines is
+defined in *Docker* images. An image is a (layered) file system with all the
+tools and settings required for the worker.
+
+The images are stored in a "registry" (gcr.io in this case) and are then
+pulled by the machines that execute them. The
+images can be versioned so that we can pick exactly which version of the image
+we want to run.
+
+The contents of a Docker image are in turn defined in a config file called a
+`Dockerfile`. A Dockerfile is a sort of script defining how to install and
+configure the software for a machine. We keep those files in this repository as
+well so we can review changes and revert them if something breaks.
+
+The docker images also allow contributors to reproduce a failing test locally,
+as they will get the same machine configuration as used on the server. Note that
+the CUDA buildbots assume access to a particular storage bucket. Parts of the
+build script in the docker container will need to be modified in order to work
+outside of Google.
+
+## CUDA building/testing orchestration
+
+Testing CUDA on multiple architectures runs into a number of logistical
+constraints. We want to test on all major GPU architectures, reasonably quickly,
+and without spending a lot of money on it.
+
+Observations:
+* Building LLVM is CPU-bound.
+* GPUs are relatively expensive.
+* The GPU/CPU ratio restricts how many CPUs we can have on a VM.
+* Preemptible VMs are much cheaper.
+* A clean LLVM build takes a long time.
+* Incremental LLVM builds are relatively quick.
+* All buildbots build the same LLVM.
+
+Decisions:
+* Use preemptible VMs.
+* Use one VM with a lot of CPU cores to build LLVM and let the other bots with
+  a smaller number of cores use it. Eventually we may move the build to a
+  dedicated VM shared across all bots that test additional functionality.
+* Save/restore ccache on the build VM to speed up builds after VM preemption.
+
+# Folder structure
+
+* `docker` - Dockerfiles for the workers and some scripting
+* `terraform` - cluster configuration and deployment
+* `config.sh` - variables used in other scripts
+* `gcloud_config.sh` - configure cloud tooling
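+
+As a rough sketch of how these pieces fit together (assuming `gcloud`, `docker`,
+`kubectl` and `terraform` are already installed and authenticated for the
+project configured in `config.sh`), a typical round trip looks like this:
+
+```bash
+cd buildbot/google/cuda
+./gcloud_config.sh                 # point gcloud/kubectl at the cudabot cluster
+./docker/build_deploy.sh cudabot   # build the worker image and push it to gcr.io
+(cd terraform && terraform plan)   # review infrastructure changes before applying
+```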
Index: buildbot/google/cuda/config.sh
===================================================================
--- /dev/null
+++ buildbot/google/cuda/config.sh
@@ -0,0 +1,5 @@
+# Config parameters for Google Cloud; this file is sourced by other scripts.
+GCP_PROJECT="sanitizer-bots"
+GCP_ZONE="us-central1-a"
+GCP_CLUSTER="cudabot"
+GCR_PREFIX="gcr.io/${GCP_PROJECT}"
Index: buildbot/google/cuda/docker/build_deploy.sh
===================================================================
--- /dev/null
+++ buildbot/google/cuda/docker/build_deploy.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+#===-- build_deploy.sh ---------------------------------------------------===//
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===----------------------------------------------------------------------===//
+# This script builds a docker image and deploys it to the registry.
+# Arguments:
+#   <image name> : name of the subfolder that contains the Dockerfile
+# This updates the `VERSION` file with the latest version number.
+#===----------------------------------------------------------------------===//
+
+set -eu
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+IMAGE_NAME="${1%/}"
+
+# increment version number
+cd "${DIR}/${IMAGE_NAME}"
+
+# get version numbers from repository
+# FIXME: use variables to configure URL
+ALL_VERSIONS=$(gcloud container images list-tags gcr.io/sanitizer-bots/${IMAGE_NAME} --format=text | \
+  awk '/tags.*:\W+[0-9]+$/ {print $2}' | tail -1)
+# read local version number from file and add it to the array
+ALL_VERSIONS+=($(cat VERSION))
+# find maximum version number and increment it
+VERSION=$(echo "${ALL_VERSIONS[*]}" | tr ' ' '\n' | sort -nr | head -n1)
+VERSION=$(( ${VERSION} + 1 ))
+
+if false; then
+# get the git hash and add some suffixes
+GIT_HASH=$(git rev-parse HEAD)
+if [[ $(git diff --stat) != '' ]]; then
+  # if working copy is dirty
+  GIT_HASH+="-dirty-${USER}"
+elif [[ $(git --no-pager diff origin/master | wc -l) > 0 ]]; then
+  # if the hash has not been uploaded to origin/master yet
+  GIT_HASH+="-local-${USER}"
+fi
+else
+GIT_HASH=c0ffee
+fi
+
+# fully qualified image name
+# FIXME: use variables to configure URL
+QUALIFIED_NAME="gcr.io/sanitizer-bots/${IMAGE_NAME}"
+# tags to be added to the image and pushed to the repository
+TAGS=(
+  "${QUALIFIED_NAME}:latest"
+  "${QUALIFIED_NAME}:${VERSION}"
+  "${QUALIFIED_NAME}:${GIT_HASH}"
+  )
+
+# build the image and tag it locally
+docker build -t ${IMAGE_NAME}:latest -t ${IMAGE_NAME}:${VERSION} .
+
+# print the list of tags to be pushed
+echo "-----------------------------------------"
+echo "image version: ${VERSION}"
+echo "tags:"
+printf '  %s\n' "${TAGS[@]}"
+echo "-----------------------------------------"
+read -p "Push to registry? [yN]" -n 1 -r
+echo
+
+if [[ $REPLY =~ ^[Yy]$ ]]
+then
+  for TAG in "${TAGS[@]}"
+  do
+    docker tag ${IMAGE_NAME}:${VERSION} "${TAG}"
+    docker push "${TAG}"
+  done
+  # store the version number
+  echo "${VERSION}" > VERSION
+fi
Index: buildbot/google/cuda/docker/build_run.sh
===================================================================
--- /dev/null
+++ buildbot/google/cuda/docker/build_run.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+#===-- build_run.sh ------------------------------------------------------===//
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===----------------------------------------------------------------------===//
+# This script builds a docker image locally and runs it for testing.
+# Arguments:
+#   <image name>  : name of the subfolder that contains the Dockerfile
+#   <secrets dir> : local folder that is mounted as /secrets in the container
+# optional: <command> : command to run in the container instead of the default
+#===----------------------------------------------------------------------===//
+
+set -eux
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+IMAGE_NAME="${1%/}"
+SECRET_STORAGE="$2"
+CMD=
+if [ "$#" -eq 3 ];
+then
+  CMD="$3"
+fi
+
+cd "${DIR}/${IMAGE_NAME}"
+
+docker build -t "${IMAGE_NAME}:latest" .
+docker run -it -v "${SECRET_STORAGE}":/secrets -e BUILDBOTS=cuda-gce-test-t4-0 \
+  --tmpfs=/memfs:exec \
+  "${IMAGE_NAME}" ${CMD}
Index: buildbot/google/cuda/docker/cudabot/Dockerfile
===================================================================
--- /dev/null
+++ buildbot/google/cuda/docker/cudabot/Dockerfile
@@ -0,0 +1,70 @@
+# There is already an Ubuntu image with cuda :)
+FROM nvidia/cuda:10.2-base
+
+# for the host configuration see:
+# https://github.com/NVIDIA/nvidia-docker
+
+# install build tools
+# set -eux;\
+RUN apt-get update; \
+    apt-get install -y software-properties-common apt-transport-https \
+      ca-certificates ninja-build cuda-compat-11-0 \
+      python-virtualenv python-pip python3-pip \
+      python-psutil git zstd wget gnupg ccache 'libstdc++-*-dev' \
+    && wget -qO- "https://raw.githubusercontent.com/chromium/chromium/master/tools/clang/scripts/update.py" \
+      | /usr/bin/python - --output-dir=/usr/local/clang \
+    && update-alternatives --install /usr/bin/clang clang /usr/local/clang/bin/clang 200 \
+    && update-alternatives --install /usr/bin/clang++ clang++ /usr/local/clang/bin/clang++ 200 \
+    && update-alternatives --install /usr/bin/lld lld /usr/local/clang/bin/lld 200 \
+       --slave /usr/bin/ld.lld ld.lld /usr/local/clang/bin/lld
+
+# install cuda
+RUN wget --progress=bar:force:noscroll http://developer.download.nvidia.com/compute/cuda/11.0.1/local_installers/cuda_11.0.1_450.36.06_linux.run \
+    && bash cuda_11.0.1_450.36.06_linux.run --silent --defaultroot=/usr/local/cuda-11.0 --toolkit --toolkitpath=/usr/local/cuda-11.0 \
+    && rm cuda_11.0.1_450.36.06_linux.run \
+    && ( cd /usr/local/cuda-11.0 \
+         && rm -rf *nsight* doc lib64/*sparse* lib64/*solver* lib64/*fft* lib64/*_static.a )
+RUN wget --progress=bar:force:noscroll http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run \
+    && bash cuda_10.2.89_440.33.01_linux.run --silent --defaultroot=/usr/local/cuda-10.2 --toolkit --toolkitpath=/usr/local/cuda-10.2 \
+    && rm cuda_10.2.89_440.33.01_linux.run \
+    && ( cd /usr/local/cuda-10.2 \
+         && rm -rf *nsight* doc lib64/*sparse* lib64/*solver* lib64/*fft* lib64/*_static.a )
+
+RUN wget -qO- https://packages.cloud.google.com/apt/doc/apt-key.gpg \
+    | apt-key --keyring /etc/apt/trusted.gpg.d/kitware.gpg add - \
+    && 
apt-add-repository "deb http://packages.cloud.google.com/apt cloud-sdk main" \ + && apt-get update \ + && apt-get install -y google-cloud-sdk + +RUN pip install buildbot-slave==0.8.12 \ + && pip3 install lit + +RUN groupadd -g 999 builder \ + && useradd -r -u 999 -g builder -d /buildbot builder \ + && mkdir /buildbot \ + && chown builder:builder /buildbot + +# Ubuntu ships with old cmake version, install the latest one +# from https://apt.kitware.com/ +RUN wget -qO - https://github.com/Kitware/CMake/releases/download/v3.17.3/cmake-3.17.3-Linux-x86_64.tar.gz \ + | tar -C /usr/local --strip-components=1 -zxf - + +# +RUN apt-get install -y dstat + +USER builder + +# Speed up git clone on restart +RUN git config --global pack.threads 8 +# Pre-stage test-suite repo with huge sources removed. +RUN git clone "https://github.com/llvm/llvm-test-suite.git" /buildbot/llvm-test-suite \ + && rm -rf /buildbot/llvm-test-suite/{ABI-Testsuite,Bitcode,MultiSource} + +ENV PATH=$PATH:/buildbot +ENV LD_LIBRARY_PATH=/usr/local/cuda-11.0/compat:$LD_LIBRARY_PATH +COPY bootstrap.sh /buildbot +COPY cuda-build.sh /buildbot/cuda-build +# Test bots run the same script for now. +COPY cuda-build.sh /buildbot/cuda-test +COPY external.py /buildbot +CMD /buildbot/bootstrap.sh Index: buildbot/google/cuda/docker/cudabot/VERSION =================================================================== --- /dev/null +++ buildbot/google/cuda/docker/cudabot/VERSION @@ -0,0 +1 @@ +28 Index: buildbot/google/cuda/docker/cudabot/bootstrap.sh =================================================================== --- /dev/null +++ buildbot/google/cuda/docker/cudabot/bootstrap.sh @@ -0,0 +1,74 @@ +#! /bin/bash + +# Read the worker password from a mounted file. +_BUILDBOT_PASSWD=$(cat /secrets/token) +# buildbot logs environment vars. Unset the password so it does not leak. +unset BUILDBOT_PASSWD + +BUILDBOT_MASTER="${BUILDBOT_MASTER:-lab.llvm.org:9994}" + +# It looks like GKE sometimes deploys the container before the NVIDIA drivers +# are loaded on the host. In this case the GPU is not available during the +# entire lifecycle of the container. Not sure how to fix this properly. + +RETURN_CODE=$(nvidia-smi > /dev/null ; echo $?) +if [[ "$RETURN_CODE" != "0" ]] ; then + echo "ERROR: Failed to access NVIDIA graphics card." \ + | tee /dev/termination-log + echo "Exiting in 30 secs..." + sleep 30 + exit 1 +fi + +cd /buildbot/ + +for name in $BUILDBOTS ; do + buildslave create-slave \ + "${name}" "${BUILDBOT_MASTER}" \ + "${name}" "${_BUILDBOT_PASSWD}" + + # populate host info. + ( + uname -a ; + cat /proc/cpuinfo | grep "model name" | head -n1 | cut -d " " -f 3- ; + echo "number of cores: $(nproc)" ; + nvidia-smi -L | cut -d "(" -f 1 ; + lsb_release -d | cut -f 2- ; + clang --version | head -n1 ; + ld.lld --version ; + cmake --version | head -n1 + ) > ${name}/info/host + echo "Artem Belevich " > "${name}/info/admin" + + buildslave start "${name}" + # tail logs and exit when buildslave is done. + TAIL_ARGS="${TAIL_ARGS} -f ${name}/twistd.log --pid $(cat ${name}/twistd.pid)" + BUILDBOT="$name" +done + + +# log all bots specified on the command line. +if [ "${TAIL_ARGS}" = "" ]; then + echo "No build bots specified in BUILDBOTS environment" \ + | tee /dev/termination-log + exit +fi + +# Restore ccache +CCACHE_SNAPSHOT="gs://cudabot-gce-artifacts/ccache-snapshot-${BUILDBOT}.tar.zst" +# Record snapshot location for future updates. +echo "${CCACHE_SNAPSHOT}" > "$HOME/ccache_snapshot.uri" +# This is a best-effort attempt to populate ccache. 
Remove ccache on any failures.
+(gsutil cp "${CCACHE_SNAPSHOT}" - | zstd -d | tar -C $HOME -xf -) \
+  || rm -rf ${HOME}/ccache
+
+# If we've got some bots running, tail their logs.
+# Leave a tombstone termination log message in case we get terminated.
+echo "Buildbot terminated unexpectedly" > /dev/termination-log
+tail ${TAIL_ARGS}
+echo "Buildbot has finished."
+
+while [ -f $HOME/DO_NOT_QUIT ]; do
+  echo "DO_NOT_QUIT file is present. Staying alive."
+  sleep 60
+done
Index: buildbot/google/cuda/docker/cudabot/cuda-build.sh
===================================================================
--- /dev/null
+++ buildbot/google/cuda/docker/cudabot/cuda-build.sh
@@ -0,0 +1,212 @@
+#!/bin/bash
+
+set -euE
+set -o pipefail
+trap 'kill $$' ERR
+
+# Stop if we've encountered an error.
+echo "@@@HALT_ON_FAILURE@@@"
+
+function step() {
+  echo "@@@BUILD_STEP ${@}@@@"
+}
+
+function step_summary() {
+  echo "@@@STEP_SUMMARY_TEXT@${@}@@@"
+}
+
+function step_exception() {
+  echo "@@@STEP_EXCEPTION@@@"
+}
+
+function run() {
+  echo ">>> " "${@}"
+  "${@}"
+}
+
+BUILDBOT_DIR=$(readlink -f ..)
+LLVM_TREE="${BUILDBOT_DIR}/llvm-project"
+REVISION=${BUILDBOT_REVISION:-origin/master}
+NPROC=$(nproc)
+# By default build for all major architectures.
+GPU_ARCH=${GPU_ARCH:="sm_35;sm_60;sm_75"}
+# K80 sometimes hangs/deadlocks on parallel jobs. Allow overriding it.
+CUDA_TEST_JOBS=${CUDA_TEST_JOBS:-4}
+
+if [ ! -d "${LLVM_TREE}" ]; then
+  step "Checking out LLVM tree."
+  run git clone --shallow-since="1 week ago" --progress \
+      https://github.com/llvm/llvm-project.git \
+      ${LLVM_TREE}
+  run touch ${BUILDBOT_DIR}/.we_own_llvm_tree
+fi
+
+if [ -f ${BUILDBOT_DIR}/.we_own_llvm_tree ]; then
+  step "Updating LLVM tree"
+  run git -C "${LLVM_TREE}" fetch origin
+  run git -C "${LLVM_TREE}" reset --hard ${REVISION}
+fi
+
+step "Setting up the build."
+LLVM_DIR="${LLVM_TREE}/llvm"
+MEMFS_DIR="/memfs"
+BUILD_BASE_DIR="${MEMFS_DIR}/build"
+BUILD_DIR="${BUILD_BASE_DIR}/${BUILDBOT_SLAVENAME}/${BUILDBOT_BUILDERNAME}"
+export DESTDIR=${BUILD_DIR}/install
+echo BUILD_DIR=${BUILD_DIR}
+echo LLVM_DIR="${LLVM_DIR}"
+run rm -rf "${BUILD_DIR}"
+export TMPDIR=${MEMFS_DIR}/tmp
+run mkdir -p ${TMPDIR}
+
+# CCACHE is re-populated from the snapshot by the bootstrap script.
+export CCACHE_DIR="${HOME}/ccache"
+run mkdir -p "${CCACHE_DIR}" # In case there's no bootstrapped ccache.
+run ccache -M 5GB
+CCACHE_SNAPSHOT=$(cat $HOME/ccache_snapshot.uri)
+echo "CCACHE snapshot location: ${CCACHE_SNAPSHOT}"
+
+# Check out/update test suite.
+TESTSUITE_DIR="${HOME}/llvm-test-suite"
+if [ -d "${TESTSUITE_DIR}" ]; then
+  step "Update LLVM test suite"
+  run git -C "${TESTSUITE_DIR}" fetch
+  run git -C "${TESTSUITE_DIR}" reset --hard origin/master
+else
+  step "Check out LLVM test suite"
+  run git clone --progress "https://github.com/llvm/llvm-test-suite.git" "${TESTSUITE_DIR}"
+fi
+
+EXT_DIR="${BUILD_DIR}/externals/cuda"
+run rm -rf "${EXT_DIR}"
+run mkdir -p "${EXT_DIR}"
+
+# Creates a fake GCC installation.
+function create_fake_gcc_install() {
+  VERSION=$1
+  DIR=${EXT_DIR}/gcc-${VERSION}
+  mkdir -p ${DIR}/include/c++
+  mkdir -p ${DIR}/bin
+  ln -s /usr/include/c++/$VERSION ${DIR}/include/c++/$VERSION
+  # Work around https://github.com/ninja-build/ninja/issues/1330
+  # Otherwise ninja collapses '..' in the deps paths and can't find the headers.
+  ln -s /usr/include/x86_64-linux-gnu ${DIR}/include
+  mkdir -p ${DIR}/lib/gcc/x86_64-unknown-linux-gnu
+  ln -s /usr/lib/gcc/x86_64-linux-gnu/$VERSION ${DIR}/lib/gcc/x86_64-unknown-linux-gnu/$VERSION
+  cat <<EOF >${DIR}/bin/gcc
+#! /bin/bash
+clang++ --gcc-toolchain=\$(dirname \$0)/.. "\$@"
+EOF
+  chmod a+x ${DIR}/bin/gcc
+}
+
+# Create a fake GCC installation for every libstdc++ version we've found.
+for dir in $(find /usr/include/c++/ -mindepth 1 -maxdepth 1 -type d); do
+  v=$(basename $dir)
+  create_fake_gcc_install $v
+done
+
+# Set up links to CUDA variants we have installed.
+for dir in $(find /usr/local -mindepth 1 -maxdepth 1 -type d -name cuda-\*); do
+  ln -s $dir ${EXT_DIR}
+done
+
+function build_and_test() {
+  step "Configure Clang"
+  LLVM_BUILD_DIR="${BUILD_DIR}/llvm"
+  run rm -rf "${LLVM_BUILD_DIR}"
+  run mkdir -p "${LLVM_BUILD_DIR}"
+  run cd "${LLVM_BUILD_DIR}"
+  run cmake -G Ninja -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
+      -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON \
+      -DLLVM_CCACHE_BUILD=ON -DLLVM_USE_LINKER=lld \
+      -DLLVM_ENABLE_PROJECTS="clang;libcxx;libcxxabi;libunwind" \
+      -DCMAKE_CXX_COMPILER="clang++" -DCMAKE_C_COMPILER="clang" \
+      -DLLVM_TOOL_CLANG_TOOLS_EXTRA_BUILD=false \
+      -DLLVM_LIT_ARGS="-v -vv" \
+      ${LLVM_DIR}
+
+  step "Building LLVM & Clang"
+  run ninja
+  run rm -rf "${DESTDIR}"
+  run ninja install
+
+  # Save LLVM/clang binaries to be reused by other bots.
+  if [ "${BUILDBOT_BUILDERNAME}" = "clang-cuda-gce-test-t4" ]; then
+    echo "Uploading installed binaries to Cloud"
+    echo "Revision: ${BUILDBOT_REVISION}"
+    tar -C "${DESTDIR}" -cf - . \
+      | zstd -3 -T20 \
+      | gsutil -o GSUtil:parallel_composite_upload_threshold=150M \
+          cp - gs://cudabot-gce-artifacts/llvm-${BUILDBOT_REVISION}.tar.zst
+    # Save ninja build log for performance analysis.
+    gsutil cp .ninja_log gs://cudabot-gce-artifacts/llvm-${BUILDBOT_REVISION}.ninja_log
+  fi
+
+  # Save ccache snapshot.
+  tar -C "${HOME}" -cf - ccache \
+    | zstd -3 -T20 \
+    | gsutil -o GSUtil:parallel_composite_upload_threshold=150M \
+        cp - "${CCACHE_SNAPSHOT}"
+
+  step "Testing LLVM"
+  run ninja check-llvm
+
+  step "Testing Clang"
+  run ninja check-clang
+}
+
+function fetch_prebuilt_clang () {
+  local revision="$1"
+  local destdir="$2"
+  local timeout="10 minutes" # A bit longer than a typical fast bot build time.
+  local endtime=$(date -ud "$timeout" +%s)
+  local snapshot="gs://cudabot-gce-artifacts/llvm-${revision}.tar.zst"
+
+  step "Waiting for LLVM & Clang snapshot to be built."
+  while [[ $(date -u +%s) -le $endtime ]]
+  do
+    if gsutil ls -l "${snapshot}" ; then
+      mkdir -p "${destdir}"
+      gsutil cp ${snapshot} - | zstd -d | tar -C "${destdir}" -xf -
+      # We've got the snapshot and are done here.
+      return
+    fi
+    echo "$(date +%H:%M:%S) No snapshot available yet. Still waiting."
+    sleep 20
+  done
+  # We've timed out waiting for the snapshot. Bail out and try again.
+  step_summary "Timed out waiting for LLVM binaries to be built."
+  step_exception
+  exit 0
+}
+
+if [[ $NPROC -gt 16 ]] ; then
+  build_and_test
+else
+  # Machine is too slow. Wait for some other bot to build us a copy.
+  fetch_prebuilt_clang "${BUILDBOT_REVISION}" "${DESTDIR}"
+fi
+
+step "Configuring CUDA test-suite"
+TEST_BUILD_DIR=${BUILD_DIR}/test-suite-build
+run rm -rf ${TEST_BUILD_DIR}
+run mkdir -p ${TEST_BUILD_DIR}
+run cd ${TEST_BUILD_DIR}
+run cmake -G Ninja -DTEST_SUITE_SUBDIRS=External \
+    -DTEST_SUITE_EXTERNALS_DIR=${EXT_DIR}/.. \
+    -DTEST_SUITE_COLLECT_CODE_SIZE=OFF \
+    -DTEST_SUITE_COLLECT_COMPILE_TIME=OFF \
+    -DCUDA_GPU_ARCH="${GPU_ARCH}" \
+    -DCUDA_JOBS=${CUDA_TEST_JOBS} \
+    -DCMAKE_CXX_COMPILER="${DESTDIR}/usr/local/bin/clang++" \
+    -DCMAKE_C_COMPILER="${DESTDIR}/usr/local/bin/clang" \
+    ${TESTSUITE_DIR}
+
+step "Building CUDA test-suite"
+run ninja cuda-tests-simple
+
+step "Testing CUDA test-suite"
+run ninja check-cuda-simple
+
+exit 0
Index: buildbot/google/cuda/docker/cudabot/shutdown-script.sh
===================================================================
--- /dev/null
+++ buildbot/google/cuda/docker/cudabot/shutdown-script.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -euE
+set -o pipefail
+trap 'kill $$' ERR
+
Index: buildbot/google/cuda/gcloud_config.sh
===================================================================
--- /dev/null
+++ buildbot/google/cuda/gcloud_config.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -eux
+
+ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+# load project configuration
+source "${ROOT_DIR}/config.sh"
+
+gcloud config set project ${GCP_PROJECT}
+gcloud config set compute/zone ${GCP_ZONE}
+gcloud auth configure-docker
+gcloud container clusters get-credentials $GCP_CLUSTER
Index: buildbot/google/cuda/terraform/README.md
===================================================================
--- /dev/null
+++ buildbot/google/cuda/terraform/README.md
@@ -0,0 +1,50 @@
+This folder contains the Terraform configuration to spawn the build bots.
+
+Before deploying anything new, use `terraform plan` to check that you're only
+modifying the parts that you intended to.
+
+
+# Installation
+
+To set up your local machine to deploy changes to the cluster, follow these
+steps:
+
+1. Install these tools:
+    1. [Terraform](https://learn.hashicorp.com/terraform/getting-started/install.html)
+    1. [Google Cloud SDK](https://cloud.google.com/sdk/install)
+    1. [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/)
+1. Run `llvm-zorg/buildbot/google/cuda/gcloud_config.sh` to configure the Google
+   Cloud SDK.
+1. To configure the GCP credentials for Terraform, run:
+   ```bash
+   export GOOGLE_CREDENTIALS=~/.config/gcloud/legacy_credentials/<your email>/adc.json
+   ```
+
+# Deploying to a new Google Cloud project
+
+When deploying this cluster to a completely new Google Cloud project, these
+manual steps are required:
+
+* You need to create the GCP project manually before Terraform works.
+* You also need to go to the Kubernetes page once, to enable Kubernetes and
+  Container Registry for that project.
+* GPUs need to be enabled on Kubernetes by following these
+  [instructions](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers).
+
+
+# Secrets
+
+To keep secrets a secret, they MUST not be stored in version control. The right
+place for them on Kubernetes is a "secret". To create a kubernetes secret for
+the agent token:
+```bash
+kubectl create secret generic buildbot-token-cudabot-gce \
+  --from-literal=token=<worker password>
+```
+where `<worker password>` is the password of the buildbot worker in plain text.
+In the `Deployment` of a container, the secret is defined as a special type of
+volume and mounted in the specified path. During runtime the secret can then be
+read from that file.
+
+An example: The secret `buildbot-token-cudabot-gce` is defined (as above) in
+Kubernetes. In the deployments in this folder (e.g.
+[cudabot-deployment-t4.yaml](cudabot-deployment-t4.yaml)) it is used as a
+volume of type `secret` and then mounted at `/secrets`. During
+the runtime of the docker container, the script
+[bootstrap.sh](../docker/cudabot/bootstrap.sh) reads the secret from the
+file `/secrets/token` and uses it to create the worker configuration.
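+
+For illustration only (the authoritative definitions are the
+`cudabot-deployment-*.yaml` files in this folder), the relevant pieces of such
+a deployment look roughly like this:
+
+```yaml
+    # inside the container spec: mount the secret into a folder
+    volumeMounts:
+      - mountPath: /secrets
+        name: buildbot-token
+  # inside the pod spec: declare the secret as a volume so we can mount it
+  volumes:
+    - name: buildbot-token
+      secret:
+        optional: false
+        secretName: buildbot-token-cudabot-gce
+```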
Index: buildbot/google/cuda/terraform/cudabot-deployment-k80.yaml
===================================================================
--- /dev/null
+++ buildbot/google/cuda/terraform/cudabot-deployment-k80.yaml
@@ -0,0 +1,68 @@
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: cudabot-k80
+spec:
+  # number of instances we want to run
+  replicas: 1
+  selector:
+    matchLabels:
+      app: cudabot-k80
+  template:
+    metadata:
+      labels:
+        app: cudabot-k80
+    spec:
+      containers:
+        # the image and version we want to run
+        - image: "gcr.io/sanitizer-bots/cudabot:28"
+          name: cudabot-gce
+          # reserve "number of cores - 1" for this image, kubernetes also
+          # needs <1 core for management tools
+          resources:
+            limits:
+              cpu: "7"
+              memory: 26Gi
+              # also request to use the GPU
+              nvidia.com/gpu: "1"
+            requests:
+              cpu: "7"
+              memory: 26Gi
+              nvidia.com/gpu: "1"
+          env:
+            - name: "BUILDBOTS"
+              value: "cuda-gce-test-k80-0"
+            - name: "GPU_ARCH"
+              value: "sm_35"
+            # K80 tends to deadlock on parallel tests. Run tests one at a time.
+            - name: "CUDA_TEST_JOBS"
+              value: "1"
+          volumeMounts:
+            # mount the secrets into a folder
+            - mountPath: /secrets
+              mountPropagation: None
+              name: buildbot-token
+            # Add tmpfs for build/test objects.
+            - mountPath: /memfs
+              mountPropagation: None
+              name: buildbot-memfs
+      # specify the node pool on which to deploy
+      nodeSelector:
+        pool: nvidia-k80-pool
+      restartPolicy: Always
+      # FIXME: do we need this if we requested a GPU?
+      #tolerations:
+      #- effect: NoSchedule
+      #  key: nvidia.com/gpu
+      #  operator: Equal
+      #  value: present
+      volumes:
+        # declare the secret as a volume so we can mount it
+        - name: buildbot-token
+          secret:
+            optional: false
+            secretName: buildbot-token-cudabot-gce
+        - name: buildbot-memfs
+          emptyDir:
+            medium: Memory
Index: buildbot/google/cuda/terraform/cudabot-deployment-p4.yaml
===================================================================
--- /dev/null
+++ buildbot/google/cuda/terraform/cudabot-deployment-p4.yaml
@@ -0,0 +1,72 @@
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: cudabot-p4
+spec:
+  # number of instances we want to run
+  replicas: 1
+  selector:
+    matchLabels:
+      app: cudabot-p4
+  template:
+    metadata:
+      labels:
+        app: cudabot-p4
+    spec:
+      containers:
+        # the image and version we want to run
+        - image: "gcr.io/sanitizer-bots/cudabot:28"
+          name: cudabot-gce
+          # reserve "number of cores - 1" for this image, kubernetes also
+          # needs <1 core for management tools
+          resources:
+            limits:
+              cpu: "7"
+              memory: 26Gi
+              # also request to use the GPU
+              nvidia.com/gpu: "1"
+            requests:
+              cpu: "7"
+              memory: 26Gi
+              nvidia.com/gpu: "1"
+          env:
+            - name: "BUILDBOTS"
+              value: "cuda-gce-test-p4-0"
+            - name: "GPU_ARCH"
+              value: "sm_60"
+          volumeMounts:
+            # mount the secrets into a folder
+            - mountPath: /secrets
+              mountPropagation: None
+              name: buildbot-token
+            # Add tmpfs for build/test objects.
+            - mountPath: /memfs
+              mountPropagation: None
+              name: buildbot-memfs
+          lifecycle:
+            postStart:
+              exec:
+                command: ["/bin/sh", "-c", "echo Hello from the postStart handler > /tmp/message"]
+            preStop:
+              exec:
+                command: ["/bin/sh","-c","echo XXX We are going down. > /dev/termination-log"]
+      # specify the node pool on which to deploy
+      nodeSelector:
+        pool: nvidia-p4-pool
+      restartPolicy: Always
+      # FIXME: do we need this if we requested a GPU?
+      #tolerations:
+      #- effect: NoSchedule
+      #  key: nvidia.com/gpu
+      #  operator: Equal
+      #  value: present
+      volumes:
+        # declare the secret as a volume so we can mount it
+        - name: buildbot-token
+          secret:
+            optional: false
+            secretName: buildbot-token-cudabot-gce
+        - name: buildbot-memfs
+          emptyDir:
+            medium: Memory
Index: buildbot/google/cuda/terraform/cudabot-deployment-t4.yaml
===================================================================
--- /dev/null
+++ buildbot/google/cuda/terraform/cudabot-deployment-t4.yaml
@@ -0,0 +1,65 @@
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: cudabot-t4
+spec:
+  # number of instances we want to run
+  replicas: 1
+  selector:
+    matchLabels:
+      app: cudabot-t4
+  template:
+    metadata:
+      labels:
+        app: cudabot-t4
+    spec:
+      containers:
+        # the image and version we want to run
+        - image: "gcr.io/sanitizer-bots/cudabot:28"
+          name: cudabot-gce
+          # reserve "number of cores - 1" for this image, kubernetes also
+          # needs <1 core for management tools
+          resources:
+            limits:
+              cpu: "23"
+              memory: 26Gi
+              # also request to use the GPU
+              nvidia.com/gpu: "1"
+            requests:
+              cpu: "23"
+              memory: 26Gi
+              nvidia.com/gpu: "1"
+          env:
+            - name: "BUILDBOTS"
+              value: "cuda-gce-test-t4-0"
+            - name: "GPU_ARCH"
+              value: "sm_75"
+          volumeMounts:
+            # mount the secrets into a folder
+            - mountPath: /secrets
+              mountPropagation: None
+              name: buildbot-token
+            # Add tmpfs for build/test objects.
+            - mountPath: /memfs
+              mountPropagation: None
+              name: buildbot-memfs
+      # specify the node pool on which to deploy
+      nodeSelector:
+        pool: nvidia-t4-pool
+      restartPolicy: Always
+      # FIXME: do we need this if we requested a GPU?
+      #tolerations:
+      #- effect: NoSchedule
+      #  key: nvidia.com/gpu
+      #  operator: Equal
+      #  value: present
+      volumes:
+        # declare the secret as a volume so we can mount it
+        - name: buildbot-token
+          secret:
+            optional: false
+            secretName: buildbot-token-cudabot-gce
+        - name: buildbot-memfs
+          emptyDir:
+            medium: Memory
Index: buildbot/google/cuda/terraform/main.tf
===================================================================
--- /dev/null
+++ buildbot/google/cuda/terraform/main.tf
@@ -0,0 +1,246 @@
+
+# configure Google Cloud project
+provider "google" {
+  project = var.gcp_config.project
+  region  = var.gcp_config.region
+}
+
+resource "null_resource" "update_cluster" {
+  # Add NVIDIA driver daemonset.
+  depends_on = [google_container_cluster.cudabot_cluster]
+  # Update kubectl context for the cluster and apply nvidia's daemonset,
+  provisioner "local-exec" {
+    command = <