diff --git a/buildbot/google/config.sh b/buildbot/google/config.sh
old mode 100644
new mode 100755
diff --git a/buildbot/google/terraform/.gitignore b/buildbot/google/terraform/.gitignore
new file mode 100644
--- /dev/null
+++ b/buildbot/google/terraform/.gitignore
@@ -0,0 +1,3 @@
+.terraform/
+.terraform.tfstate.lock.info
+terraform.tfstate.backup
diff --git a/buildbot/google/terraform/README.md b/buildbot/google/terraform/README.md
--- a/buildbot/google/terraform/README.md
+++ b/buildbot/google/terraform/README.md
@@ -17,8 +17,11 @@ Cloud SDK.
 1. To configure the GCP credentials for terraform, run:
    ```bash
-   export GOOGLE_CREDENTIALS=~/.config/gcloud/legacy_credentials//adc.json
+   export GOOGLE_APPLICATION_CREDENTIALS=~/.config/gcloud/legacy_credentials//adc.json
    ```
+   Note: Some documentation recommends `GOOGLE_CREDENTIALS` for this; however,
+   that does not work for accessing the Google Cloud Storage backend in
+   terraform.

 # Deploying to new Google Cloud project
@@ -30,6 +33,15 @@ Container Registry for that project.
 * GPUs need to be enabled on Kubernetes by following these
   [instructions](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers).
+* Terraform needs to share its "state" between all users. The "backend" for
+  this can be a "bucket" on Google Cloud Storage, so you need to create that
+  bucket and give all users write access. In addition, you should enable
+  "object versioning" to be able to access previous versions of the state in
+  case it gets corrupted:
+  ```bash
+  gsutil versioning set on gs://
+  ```
+* Store the secrets (see next section).

 # Secrets
diff --git a/buildbot/google/terraform/main.tf b/buildbot/google/terraform/main.tf
--- a/buildbot/google/terraform/main.tf
+++ b/buildbot/google/terraform/main.tf
@@ -1,3 +1,14 @@
+# Configure the Cloud Storage backend that keeps the state information. It is
+# shared across all users and contains the previously deployed resources.
+# Accessing GCS requires that the environment variable
+# `GOOGLE_APPLICATION_CREDENTIALS` points to your credential file, e.g.
+# ~/.config/gcloud/legacy_credentials//adc.json
+terraform {
+  backend "gcs" {
+    bucket = "buildbot_cluster_terraform_backend"
+    prefix = "terraform/state"
+  }
+}

 # configure Google Cloud project
 provider "google" {
@@ -5,11 +16,26 @@
   region  = var.gcp_config.region
 }

+
+# create a network for the cluster, required for Kubernetes on Windows
+# FIXME: rename to "buildbot-vpc-network", causes destruction of the cluster!
+resource "google_compute_network" "vpc_network" {
+  name = "vpc-network"
+}
+
 # Create the cluster running all Kubernetes services
 resource "google_container_cluster" "primary" {
   name     = "buildbot-cluster"
   # maybe have a regional cluster for Kubernetes, as we depend on this...
   location = var.gcp_config.zone_a
+
+  # configure the local network, required for Kubernetes on Windows
+  network = google_compute_network.vpc_network.name
+  # enable alias IP addresses, required for Kubernetes on Windows
+  ip_allocation_policy {}
+
+  # use a newer Kubernetes version, otherwise Windows node pools can't be created
+  min_master_version = "1.16"

   # one node is enough (at the moment)
   initial_node_count = 1
@@ -21,6 +47,7 @@
     # use preemptible, as this saves costs
     preemptible = true
   }
+
 }

 # Create machines for mlir-nvidia
@@ -65,3 +92,140 @@
     }
   }
 }
+
+# node pool for Windows machines
+resource "google_container_node_pool" "windows_16core_pool_nodes" {
+  name     = "windows-16core-pool"
+  # specify a zone here (e.g. "-a") to avoid a redundant deployment
+  location = var.gcp_config.zone_a
+  cluster  = google_container_cluster.primary.name
+
+  # use autoscaling to only create a machine when there is a deployment
+  autoscaling {
+    min_node_count = 0
+    max_node_count = 1
+  }
+
+  node_config {
+    # use preemptible, as this saves costs
+    preemptible  = true
+    machine_type = "n1-highcpu-16"
+    # Windows deployments tend to require more disk space, so use 200 GB here.
+    disk_size_gb = 200
+    # FIXME: test if SSDs are actually faster than HDDs for our use case
+    disk_type = "pd-ssd"
+
+    # Configure the Windows image. As Windows is picky about the combination of
+    # host and container OS versions, this must be compatible with the version
+    # in your container. Recommendation: use LTSC for long-term stability.
+    # For details see
+    # https://docs.microsoft.com/en-us/virtualization/windowscontainers/deploy-containers/version-compatibility
+    # https://cloud.google.com/kubernetes-engine/docs/how-to/creating-a-cluster-windows#choose_your_windows_server_node_image
+    image_type = "WINDOWS_LTSC"
+
+    # set the permissions required for the deployment later
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+      "https://www.googleapis.com/auth/devstorage.read_only",
+    ]
+
+    # add a label to all machines of this type, so we can select them
+    # during deployment
+    labels = {
+      pool = "win-16core-pool"
+    }
+  }
+}
+
+# Deployment for the buildbot windows10_vs2019 running on Windows rather than
+# Linux.
+# Note: Deploying this takes significantly longer (~35 min) than on Linux,
+# as the images tend to be larger (~18 GB) and IO performance is lower.
+resource "kubernetes_deployment" "windows10_vs2019" {
+  metadata {
+    name = "windows10-vs2019"
+    labels = {
+      app = "windows10_vs2019"
+    }
+  }
+
+  spec {
+    # create one instance of this container
+    replicas = 1
+
+    selector {
+      match_labels = {
+        app = "windows10_vs2019"
+      }
+    }
+    strategy {
+      rolling_update {
+        # do not deploy more replicas, as the buildbot server
+        # can't handle multiple workers with the same credentials
+        max_surge = 0
+        # allow 0 replicas during updates
+        max_unavailable = 1
+      }
+      type = "RollingUpdate"
+    }
+    template {
+      metadata {
+        labels = {
+          app = "windows10_vs2019"
+        }
+      }
+
+      spec {
+        container {
+          image = "${var.gcp_config.gcr_prefix}/buildbot-windows10-vs2019:3"
+          name  = "windows10-vs2019"
+
+          # reserve "-1" for this image, kubernetes also
+          # needs <1 core for management tools
+          resources {
+            limits {
+              cpu    = "15"
+              memory = "10Gi"
+            }
+            requests {
+              cpu    = "15"
+              memory = "10Gi"
+            }
+          }
+
+          # mount the secrets into a folder
+          volume_mount {
+            mount_path = "c:\\secrets"
+            name       = "buildbot-token"
+          }
+        }
+        # select which node pool to deploy to
+        node_selector = {
+          pool = "win-16core-pool"
+        }
+        # restart in case of any crashes
+        restart_policy = "Always"
+
+        # select the secret to be mounted
+        volume {
+          name = "buildbot-token"
+          secret {
+            optional    = false
+            secret_name = "password-windows10-vs2019"
+          }
+        }
+        # Windows nodes from the node pool are marked with the taint
+        # "node.kubernetes.io/os=windows", so we need to "tolerate" this to
+        # deploy to such nodes.
+        toleration {
+          effect   = "NoSchedule"
+          key      = "node.kubernetes.io/os"
+          operator = "Equal"
+          value    = "windows"
+        }
+      }
+    }
+  }
+}