diff --git a/buildbot/google/README.md b/buildbot/google/README.md --- a/buildbot/google/README.md +++ b/buildbot/google/README.md @@ -3,3 +3,40 @@ This folder contains some of the configuration of the buildbots managed at Google. The workers are deployed on Google Cloud. + +# The cloud stack + +To deploy buildbot workers, we need to create a bunch of virtual machines +on Google Cloud. There are multiple ways to do this. *Terraform* is convenient +as it offers to declare the required machines in config files and then +create/update the machines in the cloud. + +This way we have version control over the infrastructure +and we can review changes before applying them. In case something goes wrong, +we can easily revert changes. It also allows us to copy & paste parts of the +infrastructure for additional machines. + +Internally, Terraform is using *Kubernetes* to manage the deployment of software +to machines. The software installed on the build machines is defined +in *Docker* images. An image is a (layered) file system with all the tools and +settings required for the worker. + +The images are stored in a "registry" (gcr.io in this case) and are then +pulled to the machines where they are executed. The +images can be versioned so that we can pick exactly which version of the image +we want to run. + +The contents of a Docker image are again defined in a config file called +`Dockerfile`. A Dockerfile is a sort of script defining how to install and +configure the software for a machine. We keep those files in this repository as +well so we can review changes and revert changes if something breaks. + +The docker images also allow contributors to reproduce a failing test locally, +as they will get the same machine configuration as used on the server. 
+ +# Folder structure + +* `docker` - Dockerfiles for the workers and some scripting +* `terraform` - cluster configuration and deployment +* `config.sh` - variables used in other scripts +* `gcloud_config.sh` - configure cloud tooling diff --git a/buildbot/google/config.sh b/buildbot/google/config.sh new file mode 100644 --- /dev/null +++ b/buildbot/google/config.sh @@ -0,0 +1,5 @@ +# config parameters for the Google Cloud, this is used by other scripts +GCP_PROJECT="sanitizer-bots" +GCP_ZONE="us-central1-a" +GCP_CLUSTER="buildbot-cluster" +GCR_PREFIX="gcr.io/${GCP_PROJECT}" diff --git a/buildbot/google/gcloud_config.sh b/buildbot/google/gcloud_config.sh new file mode 100755 --- /dev/null +++ b/buildbot/google/gcloud_config.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -eux + +ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +# load project configuration +source "${ROOT_DIR}/config.sh" + +gcloud config set project ${GCP_PROJECT} +gcloud config set compute/zone ${GCP_ZONE} +gcloud auth configure-docker +gcloud container clusters get-credentials $GCP_CLUSTER diff --git a/buildbot/google/terraform/README.md b/buildbot/google/terraform/README.md new file mode 100644 --- /dev/null +++ b/buildbot/google/terraform/README.md @@ -0,0 +1,54 @@ +This folder contains the Terraform configuration to spawn the build bots. + +Before deploying anything new, use `terraform plan` to check that you're only +modifying the parts that you intended to. + + +# Installation + +To set up your local machine to deploy changes to the cluster follow these +steps: + +1. Install these tools: + 1. [Terraform](https://learn.hashicorp.com/terraform/getting-started/install.html) + 1. [Google Cloud SDK](https://cloud.google.com/sdk/install) + 1. [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) +1. Run `llvm-zorg/buildbot/google/gcloud_config.sh` to configure the Google + Cloud SDK. +1. 
To configure the GCP credentials for terraform run: + ```bash + export GOOGLE_CREDENTIALS=~/.config/gcloud/legacy_credentials//adc.json + ``` + +# Deploying to a new Google Cloud project + +When deploying this cluster to a completely new Google Cloud project, these +manual steps are required: + +* You need to create the GCP project manually before Terraform works. +* You also need to go to the Kubernetes page once, to enable Kubernetes and + Container Registry for that project. +* GPUs need to be enabled on Kubernetes by following these +[instructions](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers). + + +# Secrets + +To keep secrets a secret, they MUST not be stored in version control. The right +place on kubernetes is a "secret". To create a kubernetes secret for the agent +token: +```bash +kubectl create secret generic buildbot-token-mlir-nvidia --from-file=token= +``` +The file in `` then must contain the password of the buildbot worker +in plain text. In the `Deployment` of a container, the secret is defined as a +special type of volume and mounted in the specified path. During runtime the +secret can then be read from that file. + +An example: +The secret `buildbot-token-mlir-nvidia` is defined (as above) in Kubernetes. +In the [deployment](buildbot/google/terraform/main.tf) `mlir-nvidia` it is +used as a volume of type `secret` and then mounted at `/secrets`. During the +runtime of the docker container, the script +[run.sh](../docker/buildbot-mlir-nvidia/run.sh) reads the secret from the file +`/secrets/token` and uses it to create the worker configuration. 
\ No newline at end of file diff --git a/buildbot/google/terraform/main.tf b/buildbot/google/terraform/main.tf new file mode 100644 --- /dev/null +++ b/buildbot/google/terraform/main.tf @@ -0,0 +1,144 @@ + +# configure Google Cloud project +provider "google" { + project = var.gcp_config.project + region = var.gcp_config.region +} + +# Create the cluster running all Kubernetes services +resource "google_container_cluster" "primary" { + name = "buildbot-cluster" + # maybe have a regional cluster for Kubernetes, as we depend on this... + location = var.gcp_config.zone_a + + # one node is enough (at the moment) + initial_node_count = 1 + + node_config { + # FIXME(kuhnel): turn this into a private cluster, without external IP + # We need at least 2 vCPU to run all kubernetes services + machine_type = "n1-standard-2" + # use preemptible, as this saves costs + preemptible = true + } +} + +# Create machines for mlir-nvidia +resource "google_container_node_pool" "nvidia_16core_pool_nodes" { + name = "nvidia-16core-pool" + # specify a zone here (e.g. 
"-a") to avoid a redundant deployment + location = var.gcp_config.zone_a + cluster = google_container_cluster.primary.name + + # use autoscaling to only create a machine when there is a deployment + autoscaling { + min_node_count = 0 + max_node_count = 1 + } + + node_config { + # use preemptible, as this saves costs + preemptible = true + machine_type = "n1-highcpu-16" + disk_size_gb = 100 + # FIXME: test if SSDs are actually faster than HDDs for our use case + disk_type = "pd-ssd" + guest_accelerator { + type = "nvidia-tesla-t4" + count= 1 + } + + # set the permissions required for the deployment later + oauth_scopes = [ + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + "https://www.googleapis.com/auth/devstorage.read_only", + ] + + # add a label to all machines of this type, so we can select them + # during deployment + labels = { + pool = "nvidia-16core-pool" + } + } +} + + +resource "kubernetes_deployment" "mlir-nvidia" { +# FIXME: move to kubernetes yaml file, as terraform does not support GPU +# resources on GKE. + + metadata { + name = "mlir-nvidia" + } + + spec { + replicas = 1 + + selector { + match_labels = { + app = "buildbot-mlir-nvidia" + } + } + + template { + metadata { + labels = { + app = "buildbot-mlir-nvidia" + } + } + spec { + container { + name = "mlir-nvidia" + # Specify version number for docker image, this ensures you're + # deploying the right version of the image. + image = "${var.gcp_config.gcr_prefix}/buildbot-mlir-nvidia:3" + + resources { + requests { + cpu = 15 + memory = "10Gi" + } + limits { + cpu = 15 + memory = "10Gi" + # FIXME: does not work in terraform + # https://github.com/terraform-providers/terraform-provider-kubernetes/issues/149 + # We probably need to use native Kubernetes for all deployments + # with GPUs until this is implemented. 
+ # nvidia.com/gpu = 1 + } + } + + volume_mount { + mount_path = "/secrets" + name = "buildbot-token" + } + } + volume { + name = "buildbot-token" + secret { + secret_name = "buildbot-token-mlir-nvidia" + } + } + # Nodes with a GPU automatically get a "taint". We need to + # "tolerate" this taint, otherwise we can't deploy to that node. + # This is a safeguard to only deploy containers that require GPUs + # to machines with GPUs. More details: + # * https://cloud.google.com/kubernetes-engine/docs/how-to/gpus + # * https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ + toleration { + key = "nvidia.com/gpu" + operator = "Equal" + value = "present" + effect = "NoSchedule" + } + # select which machines to deploy to, this is using the node pool + # defined above + node_selector = { + pool = "nvidia-16core-pool" + } + } + } + } +} diff --git a/buildbot/google/terraform/outputs.tf b/buildbot/google/terraform/outputs.tf new file mode 100644 diff --git a/buildbot/google/terraform/terraform.tfvars b/buildbot/google/terraform/terraform.tfvars new file mode 100644 diff --git a/buildbot/google/terraform/variables.tf b/buildbot/google/terraform/variables.tf new file mode 100644 --- /dev/null +++ b/buildbot/google/terraform/variables.tf @@ -0,0 +1,15 @@ +# configuration parameter for Google Cloud +variable "gcp_config" { + type = object({ + project = string + region = string + zone_a = string + gcr_prefix = string + }) + default = { + project = "sanitizer-bots" + region = "us-central1" + zone_a = "us-central1-a" + gcr_prefix = "gcr.io/sanitizer-bots" + } +}