diff --git a/buildbot/google/README.md b/buildbot/google/README.md new file mode 100644 --- /dev/null +++ b/buildbot/google/README.md @@ -0,0 +1,39 @@ +# LLVM buildbot workers configuration + +This folder contains some of the configuration of the buildbots managed +at Google. The workers are deployed on Google Cloud. + + +# The cloud stack + +To deploy buildbot workers, we need to create a bunch of virtual machines +on Google Cloud. There are multiple ways to do this. *Terraform* is convenient +as it lets us declare the required machines in config files and then +create/update the machines in the cloud. + +This way we have version control over the infrastructure +and we can review changes before applying them. In case something goes wrong, +we can easily revert changes. It also allows us to copy & paste parts of the +infrastructure for additional machines. + +Internally, Terraform is using *Kubernetes* to manage the deployment of software +to machines. The software installed on the build machines is defined +in *Docker* images. An image is a (layered) file system with all the tools and +settings required for the worker. + +The images are stored in a "registry" (gcr.io in this case) and are then +pulled by the machines where they are executed. The +images can be versioned so that we can pick exactly which version of the image +we want to run. + +The contents of a Docker image are again defined in a config file called +`Dockerfile`. A Dockerfile is a sort of script defining how to install and +configure the software for a machine. We keep those files in this repository as +well so we can review changes and revert changes if something breaks. 
+ +# Folder structure + +* `docker` - Dockerfiles for the workers and some scripting +* `terraform` - cluster configuration and deployment +* `config.sh` - variables used in other scripts +* `gcloud_config.sh` - configure cloud tooling \ No newline at end of file diff --git a/buildbot/google/config.sh b/buildbot/google/config.sh new file mode 100644 --- /dev/null +++ b/buildbot/google/config.sh @@ -0,0 +1,5 @@ +# Config parameters for Google Cloud; this file is used by other scripts +GCP_PROJECT="sanitizer-bots" +GCP_ZONE="us-central1-a" +GCP_CLUSTER="buildbot-cluster" +GCR_PREFIX="gcr.io/${GCP_PROJECT}" \ No newline at end of file diff --git a/buildbot/google/docker/README.md b/buildbot/google/docker/README.md new file mode 100644 --- /dev/null +++ b/buildbot/google/docker/README.md @@ -0,0 +1,14 @@ +This folder contains the Dockerfiles and scripts used for some of the +buildbot workers. + +# Scripts + +This folder also contains some scripts that are useful in working with the +docker images. + +## build_run.sh +Build a docker image and run it locally. + +## build_deploy.sh +Build a docker image, increment the version number, tag it and upload it to +the registry. This updates the `VERSION` file to track the version numbers. \ No newline at end of file diff --git a/buildbot/google/docker/build_deploy.sh b/buildbot/google/docker/build_deploy.sh new file mode 100755 --- /dev/null +++ b/buildbot/google/docker/build_deploy.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#===-- build_deploy.sh ---------------------------------------------------===// +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +#===----------------------------------------------------------------------===// +# This script will build a docker image and deploy it to the registry. +# Arguments: +# <image name>: folder next to this script holding the Dockerfile and VERSION +# This updates the `VERSION` file with the latest version number. 
+#===----------------------------------------------------------------------===// + +set -eu + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +# get config options (GCR_PREFIX) and fail with a usage hint if $1 is missing +source "${DIR}/../config.sh" +: "${1:?usage: build_deploy.sh <image name>}" +IMAGE_NAME="${1%/}" +# increment version number +cd "${DIR}/${IMAGE_NAME}" +VERSION=$(( $(cat VERSION) + 1 )) +echo "image version: ${VERSION}" + +docker build -t "${IMAGE_NAME}:latest" -t "${IMAGE_NAME}:${VERSION}" . + +read -p "Push to registry? [yN]" -n 1 -r +echo +if [[ $REPLY =~ ^[Yy]$ ]] +then + QUALIFIED_NAME="${GCR_PREFIX}/${IMAGE_NAME}" + docker tag "${IMAGE_NAME}:${VERSION}" "${QUALIFIED_NAME}:${VERSION}" + docker tag "${IMAGE_NAME}:latest" "${QUALIFIED_NAME}:latest" + docker push "${QUALIFIED_NAME}:${VERSION}" + docker push "${QUALIFIED_NAME}:latest" +fi + +echo "${VERSION}" > VERSION diff --git a/buildbot/google/docker/build_run.sh b/buildbot/google/docker/build_run.sh new file mode 100755 --- /dev/null +++ b/buildbot/google/docker/build_run.sh @@ -0,0 +1,29 @@ +#!/bin/bash +#===-- build_run.sh ------------------------------------------------------===// +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +#===----------------------------------------------------------------------===// +# This script will build a docker image and run it locally. +# Arguments: +# <image name>: folder next to this script containing the Dockerfile +# <secret storage>: host path that is mounted at /secrets in the container +# optional: <command> passed to `docker run` after the image name +#===----------------------------------------------------------------------===// + +set -eux +: "${2:?usage: build_run.sh <image name> <secret storage> [<command>]}" +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +IMAGE_NAME="${1%/}" +SECRET_STORAGE="$2" +CMD= +if [[ "$#" -eq 3 ]]; +then + CMD="$3" +fi + +cd "${DIR}/${IMAGE_NAME}" + +docker build -t "${IMAGE_NAME}" . 
+docker run -it -v "${SECRET_STORAGE}":/secrets "${IMAGE_NAME}" ${CMD} diff --git a/buildbot/google/docker/buildbot-mlir-nvidia/Dockerfile b/buildbot/google/docker/buildbot-mlir-nvidia/Dockerfile new file mode 100644 --- /dev/null +++ b/buildbot/google/docker/buildbot-mlir-nvidia/Dockerfile @@ -0,0 +1,57 @@ +#===-- Dockerfile --------------------------------------------------------===// +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +#===----------------------------------------------------------------------===// +# Docker image used for the mlir-nvidia builder +#===----------------------------------------------------------------------===// + +# Use the image from NVIDIA as base +FROM nvidia/cuda:10.2-base-ubuntu18.04 + + +# install build tools; steps are chained with && so a failure aborts the build +RUN apt-get update && \ + apt-get install -y software-properties-common apt-transport-https ca-certificates \ + clang-8 lld-8 ninja-build git wget gnupg ccache \ + python python-pip python-psutil && \ + update-alternatives --install /usr/bin/clang clang /usr/bin/clang-8 100 && \ + update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-8 100 && \ + update-alternatives --install /usr/bin/lld lld /usr/bin/lld-8 100 + +# install cuda +# avoid popups for keyboard configurations +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y cuda + +# Ubuntu ships with old cmake version, install the latest one +# from https://apt.kitware.com/ +RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | \ + gpg --dearmor - | \ + tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null && \ + apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' && \ + apt-get update && \ + apt-get install -y cmake + +# install (old) build bot version +# this version of build bot requires python2! 
+RUN pip install buildbot-slave==0.8.11 + +# Volume to mount secrets into the container +VOLUME /secrets + +# create user account, some tests fail if run as root +RUN useradd buildbot --create-home +WORKDIR /home/buildbot +USER buildbot + +# copy startup script +COPY run.sh /home/buildbot/ + +ENV WORKER_NAME="mlir-nvidia" + +# Set up buildbot host and maintainer info. +RUN mkdir -p "${WORKER_NAME}/info/" && \ + echo "Christian Kühnel " > "${WORKER_NAME}/info/admin" + +CMD ./run.sh diff --git a/buildbot/google/docker/buildbot-mlir-nvidia/VERSION b/buildbot/google/docker/buildbot-mlir-nvidia/VERSION new file mode 100644 --- /dev/null +++ b/buildbot/google/docker/buildbot-mlir-nvidia/VERSION @@ -0,0 +1 @@ +3 diff --git a/buildbot/google/docker/buildbot-mlir-nvidia/run.sh b/buildbot/google/docker/buildbot-mlir-nvidia/run.sh new file mode 100755 --- /dev/null +++ b/buildbot/google/docker/buildbot-mlir-nvidia/run.sh @@ -0,0 +1,45 @@ +#!/bin/bash +#===-- run.sh -------------------------------------------------------------===// +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +#===----------------------------------------------------------------------===// +# This script will start the buildbot worker +#===----------------------------------------------------------------------===// + +set -eu + +# Read the worker password from a mounted file. 
+WORKER_PASSWORD=$(cat /secrets/token) + +# generate the host information of this worker +( + uname -a ; \ + grep "model name" /proc/cpuinfo | head -n1 | cut -d " " -f 3- ;\ + echo "number of cores: $(nproc)" ;\ + nvidia-smi -L | cut -d "(" -f 1 ;\ + lsb_release -d | cut -f 2- ; \ + clang --version | head -n1 ; \ + ld.lld-8 --version ; \ + cmake --version | head -n1 +) > "${WORKER_NAME}/info/host" + +# FIXME(kuhnel): +# It looks like GKE sometimes deploys the container before the NVIDIA drivers +# are loaded on the host. In this case the GPU is not available during the +# entire lifecycle of the container. Not sure how to fix this properly. +# Maybe the above entry is enough as it depends on a working `nvidia-smi`. +# If not a workaround might be to check for the graphics card in this script and +# exit immediately if it's not available. + +# create the folder structure +buildslave create-slave --keepalive=200 "${WORKER_NAME}" \ + lab.llvm.org:9994 "${WORKER_NAME}" "${WORKER_PASSWORD}" + +# start the daemon, this command returns immediately +buildslave start "${WORKER_NAME}" + +# To keep the container running and produce log outputs: dump the worker +# log to stdout +tail -f "${WORKER_NAME}/twistd.log" diff --git a/buildbot/google/gcloud_config.sh b/buildbot/google/gcloud_config.sh new file mode 100644 --- /dev/null +++ b/buildbot/google/gcloud_config.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -eux + +source "$(dirname "${BASH_SOURCE[0]}")/config.sh" + +gcloud config set project "${GCP_PROJECT}" +gcloud config set compute/zone "${GCP_ZONE}" +gcloud auth configure-docker +gcloud container clusters get-credentials "${GCP_CLUSTER}" \ No newline at end of file diff --git a/buildbot/google/terraform/README.md b/buildbot/google/terraform/README.md new file mode 100644 --- /dev/null +++ b/buildbot/google/terraform/README.md @@ -0,0 +1,41 @@ +This folder contains the Terraform configuration to spawn the build bots. 
+ +Before deploying anything new, use `terraform plan` to check that you're only +modifying the parts that you intended to. + + +# Installation + +To set up your local machine to deploy changes to the cluster follow these +steps: + +1. Install these tools: + 1. [Terraform](https://learn.hashicorp.com/terraform/getting-started/install.html) + 1. [Google Cloud SDK](https://cloud.google.com/sdk/install) + 1. [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) +1. Run `llvm-zorg/buildbot/google/gcloud_config.sh` to configure the Google + Cloud SDK. +1. To configure the GCP credentials for terraform run: + ```bash + export GOOGLE_CREDENTIALS=~/.config/gcloud/legacy_credentials/<your email>/adc.json + ``` + +# Deploying to new Google Cloud project + +When deploying this cluster to a completely new Google Cloud project, these manual steps are required: + +* You need to create the GCP project manually before Terraform works. +* You also need to go to the Kubernetes page once, to enable Kubernetes for + that project. +* GPUs need to be enabled on Kubernetes by following these +[instructions](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers). + + +# Secrets + +To keep secrets a secret, they MUST NOT be stored in version control. The right +place on kubernetes is a "secret". To create a kubernetes secret for the agent +token (the name must match `secret_name` in `main.tf`): +```bash +kubectl create secret generic password-mlir-nvidia --from-file=token=<file name> +``` diff --git a/buildbot/google/terraform/main.tf b/buildbot/google/terraform/main.tf new file mode 100644 --- /dev/null +++ b/buildbot/google/terraform/main.tf @@ -0,0 +1,141 @@ + +# configure Google Cloud project +provider "google" { + project = var.gcp_config.project + region = var.gcp_config.region +} + +# Create the cluster running all Kubernetes services +resource "google_container_cluster" "primary" { + name = "buildbot-cluster" + # maybe have a regional cluster for Kubernetes, as we depend on this... 
+ location = var.gcp_config.zone_a + + # one node is enough (at the moment) + initial_node_count = 1 + + node_config { + # FIXME(kuhnel): turn this into a private cluster, without external IP + # We need at least 2 vCPU to run all kubernetes services + machine_type = "n1-standard-2" + # use preemptible, as this saves costs + preemptible = true + } +} + +# Create machines for mlir-nvidia +resource "google_container_node_pool" "nvidia_16core_pool_nodes" { + name = "nvidia-16core-pool" + # specify a zone here (e.g. "-a") to avoid a redundant deployment + location = var.gcp_config.zone_a + cluster = google_container_cluster.primary.name + + # use autoscaling to only create a machine when there is a deployment + autoscaling { + min_node_count = 0 + max_node_count = 1 + } + + node_config { + # use preemptible, as this saves costs + preemptible = true + machine_type = "n1-highcpu-16" + disk_size_gb = 100 + # FIXME: test if SSDs are actually faster than HDDs for our use case + disk_type = "pd-ssd" + guest_accelerator { + type = "nvidia-tesla-t4" + count= 1 + } + + # set the permissions required for the deployment later + oauth_scopes = [ + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + "https://www.googleapis.com/auth/devstorage.read_only", + ] + + # add a label to all machines of this type, so we can select them + # during deployment + labels = { + pool = "nvidia-16core-pool" + } + } +} + + +resource "kubernetes_deployment" "mlir-nvidia" { +# FIXME: move to kubernetes yaml file, as terraform does not support GPU +# resources on GKE. + + metadata { + name = "mlir-nvidia" + } + + spec { + replicas = 1 + + selector { + match_labels = { + app = "buildbot-mlir-nvidia" + } + } + + template { + metadata { + labels = { + app = "buildbot-mlir-nvidia" + } + } + spec { + container { + name = "mlir-nvidia" + # Specify version number for docker image, this ensures you're + # deploying the right version of the image. 
+ image = "${var.gcp_config.gcr_prefix}/buildbot-mlir-nvidia:3" + + resources { + requests { + cpu = 15 + memory = "10Gi" + } + limits { + cpu = 15 + memory = "10Gi" + # FIXME: does not work in terraform + # https://github.com/terraform-providers/terraform-provider-kubernetes/issues/149 + # We probably need to use native Kubernetes for all deployments + # with GPUs until this is implemented. + # nvidia.com/gpu = 1 + } + } + + volume_mount { + mount_path = "/secrets" + name = "buildbot-token" + } + } + volume { + name = "buildbot-token" + secret { + secret_name = "password-mlir-nvidia" + } + } + # Nodes with a GPU automatically get a "taint". We need to + # "tolerate" this taint, otherwise we can't deploy to that node. + toleration { + key = "nvidia.com/gpu" + operator = "Equal" + value = "present" + effect = "NoSchedule" + } + # select which machines to deploy to, this is using the node pool + # defined above + node_selector = { + pool = "nvidia-16core-pool" + } + } + } + } +} + diff --git a/buildbot/google/terraform/outputs.tf b/buildbot/google/terraform/outputs.tf new file mode 100644 diff --git a/buildbot/google/terraform/terraform.tfvars b/buildbot/google/terraform/terraform.tfvars new file mode 100644 diff --git a/buildbot/google/terraform/variables.tf b/buildbot/google/terraform/variables.tf new file mode 100644 --- /dev/null +++ b/buildbot/google/terraform/variables.tf @@ -0,0 +1,15 @@ +# configuration parameters for Google Cloud +variable "gcp_config" { + type = object({ + project = string + region = string + zone_a = string + gcr_prefix = string + }) + default = { + project = "sanitizer-bots" + region = "us-central1" + zone_a = "us-central1-a" + gcr_prefix = "gcr.io/sanitizer-bots" + } +}