Add a New Target

FRIDGE is deployed to a Kubernetes (K8s) cluster that must meet the following requirements:

  • Cilium as the Container Network Interface (CNI), for network policy enforcement
  • A Container Storage Interface (CSI) driver that supports Bring Your Own Key (BYOK) volume encryption

Define a new K8s environment

The targets are defined in an Enum object in infra/fridge/access-cluster/enums/__init__.py. These environments are used in flow control to make target-specific changes. Add your target to the Enum alongside the existing entries:

from enum import Enum, unique


@unique
class K8sEnvironment(Enum):
    AKS = "AKS"
    DAWN = "Dawn"
    K3S = "K3s"


@unique
class PodSecurityStandard(Enum):
    RESTRICTED = {"pod-security.kubernetes.io/enforce": "restricted"}
    PRIVILEGED = {"pod-security.kubernetes.io/enforce": "privileged"}


@unique
class TlsEnvironment(Enum):
    STAGING = "staging"
    PRODUCTION = "production"
    DEVELOPMENT = "development"


tls_issuer_names = {
    TlsEnvironment.STAGING: "letsencrypt-staging",
    TlsEnvironment.PRODUCTION: "letsencrypt-prod",
    TlsEnvironment.DEVELOPMENT: "dev-issuer",
}
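
For example, a hypothetical OpenStack-based target could be registered by adding a member to K8sEnvironment (the OPENSTACK name and value below are illustrative, not part of FRIDGE):

@unique
class K8sEnvironment(Enum):
    AKS = "AKS"
    DAWN = "Dawn"
    K3S = "K3s"
    OPENSTACK = "OpenStack"  # hypothetical new target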

Storage Class

FRIDGE needs storage to support its functions, and this storage is presented via a StorageClass. Ideally, the StorageClass should support passing a key to encrypt volumes, which requires the K8s implementation to have a CSI driver with that capability. If your K8s implementation does not have such a CSI driver, you can instead use Longhorn.

Storage classes used by FRIDGE are defined, for each K8s environment, in infra/fridge/access-cluster/components/storage_classes.py. Each target must define:

storage_class
Storage class object for sensitive data. Encrypted with a deployer-provided key or by Longhorn.
standard_storage_name
String giving the name of a storage class for non-sensitive data.
standard_supports_rwm
Boolean indicating whether the storage class named standard_storage_name supports ReadWriteMany.

from pulumi import ComponentResource, ResourceOptions
from pulumi_kubernetes.core.v1 import Namespace
from pulumi_kubernetes.meta.v1 import ObjectMetaArgs
from pulumi_kubernetes.helm.v3 import Release, RepositoryOptsArgs
from pulumi_kubernetes.storage.v1 import StorageClass

from enums import K8sEnvironment, PodSecurityStandard

STORAGE_CLASS_NAME = "fridge"


class StorageClassesArgs:
    def __init__(
        self,
        k8s_environment: K8sEnvironment,
        azure_disk_encryption_set: str | None = None,
        azure_resource_group: str | None = None,
        azure_subscription_id: str | None = None,
    ) -> None:
        self.k8s_environment = k8s_environment
        self.azure_disk_encryption_set = azure_disk_encryption_set
        self.azure_resource_group = azure_resource_group
        self.azure_subscription_id = azure_subscription_id


class StorageClasses(ComponentResource):
    def __init__(
        self, name: str, args: StorageClassesArgs, opts: ResourceOptions | None = None
    ) -> None:
        super().__init__("fridge:StorageClasses", name, None, opts)
        child_opts = ResourceOptions.merge(opts, ResourceOptions(parent=self))

        k8s_environment = args.k8s_environment

        match k8s_environment:
            case K8sEnvironment.AKS:
                storage_class = StorageClass(
                    "fridge_storage_class",
                    allow_volume_expansion=True,
                    metadata=ObjectMetaArgs(
                        name=STORAGE_CLASS_NAME,
                    ),
                    parameters={
                        "diskEncryptionSetID": f"/subscriptions/{args.azure_subscription_id}/resourceGroups/{args.azure_resource_group}/providers/Microsoft.Compute/diskEncryptionSets/{args.azure_disk_encryption_set}",
                        "kind": "managed",
                        "skuname": "StandardSSD_LRS",
                    },
                    provisioner="disk.csi.azure.com",
                    opts=child_opts,
                )

                standard_storage_name = "azurefile"
                standard_supports_rwm = True
            case K8sEnvironment.DAWN:
                longhorn_ns = Namespace(
                    "longhorn-system",
                    metadata=ObjectMetaArgs(
                        name="longhorn-system",
                        labels={} | PodSecurityStandard.PRIVILEGED.value,
                    ),
                    opts=child_opts,
                )

                longhorn = Release(
                    "longhorn",
                    namespace=longhorn_ns.metadata.name,
                    chart="longhorn",
                    version="1.9.0",
                    repository_opts=RepositoryOptsArgs(
                        repo="https://charts.longhorn.io",
                    ),
                    # Add a toleration for the GPU node, to allow Longhorn to schedule pods/create volumes there
                    values={
                        "global": {
                            "tolerations": [
                                {
                                    "key": "gpu.intel.com/i915",
                                    "operator": "Exists",
                                    "effect": "NoSchedule",
                                }
                            ]
                        },
                        "defaultSettings": {
                            "taintToleration": "gpu.intel.com/i915:NoSchedule"
                        },
                        "persistence": {"defaultClassReplicaCount": 2},
                    },
                    opts=ResourceOptions.merge(
                        child_opts,
                        ResourceOptions(depends_on=[longhorn_ns]),
                    ),
                )

                storage_class = StorageClass(
                    "fridge_storage_class",
                    allow_volume_expansion=True,
                    metadata=ObjectMetaArgs(
                        name=STORAGE_CLASS_NAME,
                    ),
                    parameters={
                        "dataLocality": "best-effort",
                        "fsType": "ext4",
                        "numberOfReplicas": "2",
                        "staleReplicaTimeout": "2880",
                    },
                    provisioner="driver.longhorn.io",
                    opts=ResourceOptions.merge(
                        child_opts,
                        ResourceOptions(depends_on=[longhorn]),
                    ),
                )

                standard_storage_name = storage_class.metadata.name
                standard_supports_rwm = True
            case K8sEnvironment.K3S:
                storage_class = StorageClass.get("fridge-storage-class", "local-path")
                standard_storage_name = storage_class.metadata.name
                standard_supports_rwm = False

        self.encrypted_storage_class = storage_class
        self.standard_storage_name = standard_storage_name
        self.standard_supports_rwm = standard_supports_rwm
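
To add a new target, extend the match statement with a case for your K8sEnvironment member. A minimal sketch for the hypothetical OPENSTACK target, assuming the OpenStack Cinder CSI driver and a placeholder encryption parameter (check your CSI driver's documentation for the actual mechanism to pass a deployer-provided key):

            case K8sEnvironment.OPENSTACK:  # hypothetical target
                storage_class = StorageClass(
                    "fridge_storage_class",
                    allow_volume_expansion=True,
                    metadata=ObjectMetaArgs(
                        name=STORAGE_CLASS_NAME,
                    ),
                    parameters={
                        # Placeholder parameter: how your CSI driver accepts a
                        # deployer-provided volume encryption key
                        "encryptionKeyID": "example-key-id",
                    },
                    provisioner="cinder.csi.openstack.org",
                    opts=child_opts,
                )

                standard_storage_name = storage_class.metadata.name
                standard_supports_rwm = False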

Network Policies

Some K8s providers might require tweaks to the Cilium network policies. These are collected, similarly to the storage classes, in infra/fridge/access-cluster/components/network_policies.py. For example, with AKS:

import pulumi
from pulumi import ComponentResource, ResourceOptions
from pulumi_kubernetes.apiextensions import CustomResource
from pulumi_kubernetes.yaml import ConfigFile

from enums import K8sEnvironment


class NetworkPolicies(ComponentResource):
    def __init__(
        self,
        config: pulumi.config.Config,
        name: str,
        k8s_environment: K8sEnvironment,
        opts: ResourceOptions | None = None,
    ) -> None:
        super().__init__("fridge:k8s:NetworkPolicies", name, {}, opts)
        child_opts = ResourceOptions.merge(opts, ResourceOptions(parent=self))

        match k8s_environment:
            case K8sEnvironment.AKS:
                # AKS uses Konnectivity to mediate some API/webhook traffic, and uses a different external DNS server
                ConfigFile(
                    "network_policy_aks",
                    file="./k8s/cilium/aks.yaml",
                    opts=child_opts,
                )
            case K8sEnvironment.DAWN:
                # Dawn uses a different external DNS server to AKS, and also runs regular jobs that do not run on AKS
                ConfigFile(
                    "network_policy_dawn",
                    file="./k8s/cilium/dawn.yaml",
                    opts=child_opts,
                )
                # Add network policy to allow Prometheus monitoring for resources already deployed on Dawn
                # On Dawn, Prometheus is also already deployed
                ConfigFile(
                    "network_policy_prometheus",
                    file="./k8s/cilium/prometheus.yaml",
                    opts=child_opts,
                )
                # Longhorn is used on Dawn for RWX volume provision
                ConfigFile(
                    "network_policy_longhorn",
                    file="./k8s/cilium/longhorn.yaml",
                    opts=child_opts,
                )
            case K8sEnvironment.K3S:
                # K3S policies applicable for a local dev environment
                # These could be used in any vanilla k8s + Cilium local cluster
                ConfigFile(
                    "network_policy_k3s",
                    file="./k8s/cilium/k3s.yaml",
                    opts=child_opts,
                )

        self.api_jumpbox_cnp = CustomResource(
            "network_policy_api_jumpbox",
            api_version="cilium.io/v2",
            kind="CiliumNetworkPolicy",
            metadata={"name": "api-jumpbox-access", "namespace": "api-jumpbox"},
            spec={
                "endpointSelector": {"matchLabels": {"app": "api-jumpbox"}},
                "ingress": [
                    {
                        "fromEndpoints": [
                            {
                                "matchLabels": {
                                    "k8s:app.kubernetes.io/name": "ingress-nginx",
                                    "k8s:app.kubernetes.io/component": "controller",
                                    "k8s:io.kubernetes.pod.namespace": "ingress-nginx",
                                }
                            }
                        ],
                        "toPorts": [{"ports": [{"port": "2222", "protocol": "ANY"}]}],
                    }
                ],
                "egress": [
                    {
                        "toEndpoints": [
                            {
                                "matchLabels": {
                                    "k8s:io.kubernetes.pod.namespace": "kube-system",
                                    "k8s-app": "kube-dns",
                                }
                            }
                        ],
                        "toPorts": [
                            {
                                "ports": [{"port": "53", "protocol": "ANY"}],
                                "rules": {"dns": [{"matchPattern": "*"}]},
                            }
                        ],
                    },
                    {
                        "toEndpoints": [
                            {
                                "matchLabels": {
                                    "k8s:app.kubernetes.io/name": "ingress-nginx",
                                    "k8s:app.kubernetes.io/component": "controller",
                                    "k8s:io.kubernetes.pod.namespace": "ingress-nginx",
                                }
                            }
                        ],
                        "toPorts": [{"ports": [{"port": "2222", "protocol": "TCP"}]}],
                    },
                    {
                        "toCIDR": [config.require("fridge_api_ip_address")],
                        "toPorts": [{"ports": [{"port": "443", "protocol": "TCP"}]}],
                    },
                    {
                        "toFQDNs": [
                            {
                                "matchName": config.require(
                                    "isolated_cluster_api_endpoint"
                                )
                            }
                        ],
                        "toPorts": [{"ports": [{"port": "443", "protocol": "ANY"}]}],
                    },
                ],
            },
            opts=child_opts,
        )

        ConfigFile(
            "network_policy_cert_manager",
            file="./k8s/cilium/cert_manager.yaml",
            opts=child_opts,
        )

        ConfigFile(
            "network_policy_containerd_config",
            file="./k8s/cilium/containerd_config.yaml",
            opts=child_opts,
        )

        ConfigFile(
            "network_policy_harbor",
            file="./k8s/cilium/harbor.yaml",
            opts=child_opts,
        )

        ConfigFile(
            "network_policy_hubble",
            file="./k8s/cilium/hubble.yaml",
            opts=child_opts,
        )

        ConfigFile(
            "network_policy_ingress_nginx",
            file="./k8s/cilium/ingress-nginx.yaml",
            opts=child_opts,
        )

        ConfigFile(
            "network_policy_kube_node_lease",
            file="./k8s/cilium/kube-node-lease.yaml",
            opts=child_opts,
        )

        ConfigFile(
            "network_policy_kube_public",
            file="./k8s/cilium/kube-public.yaml",
            opts=child_opts,
        )

        ConfigFile(
            "network_policy_kubernetes_system",
            file="./k8s/cilium/kube-system.yaml",
            opts=child_opts,
        )

Here, the AKS-specific policy manifests are defined in ./access-cluster/k8s/cilium/aks.yaml.
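
A new target follows the same pattern: add a case to the match statement that applies a manifest of target-specific policies (the OPENSTACK member and manifest path below are hypothetical):

            case K8sEnvironment.OPENSTACK:  # hypothetical target
                # Target-specific tweaks, e.g. a different external DNS server
                ConfigFile(
                    "network_policy_openstack",
                    file="./k8s/cilium/openstack.yaml",
                    opts=child_opts,
                )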

Service Changes

You may also need to deploy extra services, or you may want to avoid replacing services that are already deployed. The most convenient place to do this is usually infra/fridge/access-cluster/__main__.py.

For example, the Hubble interface for Cilium is not provisioned automatically on AKS, so it is deployed here:

import pulumi

from pulumi import ResourceOptions
from pulumi_kubernetes.batch.v1 import CronJobPatch, CronJobSpecPatchArgs
from pulumi_kubernetes.core.v1 import NamespacePatch
from pulumi_kubernetes.meta.v1 import ObjectMetaPatchArgs
from pulumi_kubernetes.yaml import ConfigFile

import components
from enums import K8sEnvironment, PodSecurityStandard, TlsEnvironment


def patch_namespace(name: str, pss: PodSecurityStandard) -> NamespacePatch:
    """
    Apply a PodSecurityStandard label to a namespace
    """
    return NamespacePatch(
        f"{name}-ns-pod-security",
        metadata=ObjectMetaPatchArgs(name=name, labels={} | pss.value),
    )


config = pulumi.Config()
tls_environment = TlsEnvironment(config.require("tls_environment"))
stack_name = pulumi.get_stack()

try:
    k8s_environment = K8sEnvironment(config.get("k8s_env"))
except ValueError:
    raise ValueError(
        f"Invalid k8s environment: {config.get('k8s_env')}. "
        f"Supported values are {', '.join([item.value for item in K8sEnvironment])}."
    )

# Hubble UI
# Interface for Cilium
if k8s_environment == K8sEnvironment.AKS:
    hubble_ui = ConfigFile(
        "hubble-ui",
        file="./k8s/hubble/hubble_ui.yaml",
    )

# Private API proxy
api_ssh_jumpbox = components.FridgeAPIJumpbox(
    "fridge-api-ssh-jumpbox",
    components.FridgeAPIJumpboxArgs(
        config=config,
        k8s_environment=k8s_environment,
    ),
)

ingress_nginx = components.Ingress(
    "ingress-nginx",
    args=components.IngressArgs(
        api_jumpbox=api_ssh_jumpbox, k8s_environment=k8s_environment
    ),
)

cert_manager = components.CertManager(
    "cert-manager",
    args=components.CertManagerArgs(
        config=config,
        k8s_environment=k8s_environment,
        tls_environment=tls_environment,
    ),
)

# Storage classes
storage_classes = components.StorageClasses(
    "storage_classes",
    components.StorageClassesArgs(
        k8s_environment=k8s_environment,
        azure_disk_encryption_set=(
            config.require("azure_disk_encryption_set")
            if k8s_environment is K8sEnvironment.AKS
            else None
        ),
        azure_resource_group=(
            config.require("azure_resource_group")
            if k8s_environment is K8sEnvironment.AKS
            else None
        ),
        azure_subscription_id=(
            config.require("azure_subscription_id")
            if k8s_environment is K8sEnvironment.AKS
            else None
        ),
    ),
)

# Use patches for standard namespaces rather than trying to create them, so Pulumi does not try to delete them on teardown
standard_namespaces = ["default", "kube-node-lease", "kube-public"]
for namespace in standard_namespaces:
    patch_namespace(namespace, PodSecurityStandard.RESTRICTED)

# Harbor
harbor = components.ContainerRegistry(
    "harbor",
    components.ContainerRegistryArgs(
        config=config,
        tls_environment=tls_environment,
        storage_classes=storage_classes,
    ),
    opts=ResourceOptions(
        depends_on=[ingress_nginx, cert_manager, storage_classes],
    ),
)

# Network policy (through Cilium)
# Network policies should be deployed last to ensure that none of them interfere with the deployment process
resources = [
    cert_manager,
    harbor.configure_containerd_daemonset,
    harbor,
    ingress_nginx,
    storage_classes,
]

network_policies = components.NetworkPolicies(
    name=f"{stack_name}-network-policies",
    config=config,
    k8s_environment=k8s_environment,
    opts=ResourceOptions(
        depends_on=resources,
    ),
)

# Pulumi exports
pulumi.export("fridge_api_ip_address", config.require("fridge_api_ip_address"))
pulumi.export("harbor_fqdn", harbor.harbor_fqdn)
pulumi.export("harbor_ip_address", config.require("harbor_ip"))
pulumi.export("ingress_ip", ingress_nginx.ingress_ip)
pulumi.export("ingress_ports", ingress_nginx.ingress_ports)