# Add a New Target
FRIDGE is deployed to a Kubernetes (k8s) cluster that must meet certain requirements:

- Cilium Container Network Interface (CNI), for network policy enforcement
- A Container Storage Interface (CSI) driver that supports Bring Your Own Key (BYOK) volume encryption
## Define a new K8s environment
The targets are defined in an Enum object in `infra/fridge/access-cluster/enums/__init__.py`.
These environments are used in flow control to make target-specific changes.
Add your target to the Enum, as in the examples here:
```python
from enum import Enum, unique


@unique
class K8sEnvironment(Enum):
    AKS = "AKS"
    DAWN = "Dawn"
    K3S = "K3s"


@unique
class PodSecurityStandard(Enum):
    RESTRICTED = {"pod-security.kubernetes.io/enforce": "restricted"}
    PRIVILEGED = {"pod-security.kubernetes.io/enforce": "privileged"}


@unique
class TlsEnvironment(Enum):
    STAGING = "staging"
    PRODUCTION = "production"
    DEVELOPMENT = "development"


tls_issuer_names = {
    TlsEnvironment.STAGING: "letsencrypt-staging",
    TlsEnvironment.PRODUCTION: "letsencrypt-prod",
    TlsEnvironment.DEVELOPMENT: "dev-issuer",
}
```
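
For instance, a minimal sketch of registering a hypothetical new target named `MYCLOUD` (the member name and value are placeholders, not part of FRIDGE; the value is the string that the `k8s_env` Pulumi config setting must match):

```python
@unique
class K8sEnvironment(Enum):
    AKS = "AKS"
    DAWN = "Dawn"
    K3S = "K3s"
    MYCLOUD = "MyCloud"  # hypothetical new target
```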
## Storage Class
FRIDGE needs storage to support its functions. This storage is presented via a StorageClass. Ideally this should support passing a key to encrypt volumes, which depends on the K8s implementation having a CSI driver with this capability. If your K8s implementation does not have a CSI driver capable of this, you can instead use Longhorn.

Storage classes used by FRIDGE are defined, for each K8s environment, in `infra/fridge/access-cluster/components/storage_classes.py`.

Each target must define:

- `storage_class` - StorageClass object for sensitive data, encrypted with a deployer-provided key or by Longhorn.
- `standard_storage_name` - String giving the name of a storage class for non-sensitive data.
- `standard_supports_rwm` - Boolean indicating whether the storage class named by `standard_storage_name` supports ReadWriteMany.
```python
from pulumi import ComponentResource, ResourceOptions
from pulumi_kubernetes.core.v1 import Namespace
from pulumi_kubernetes.meta.v1 import ObjectMetaArgs
from pulumi_kubernetes.helm.v3 import Release, RepositoryOptsArgs
from pulumi_kubernetes.storage.v1 import StorageClass

from enums import K8sEnvironment, PodSecurityStandard

STORAGE_CLASS_NAME = "fridge"


class StorageClassesArgs:
    def __init__(
        self,
        k8s_environment: K8sEnvironment,
        azure_disk_encryption_set: str | None = None,
        azure_resource_group: str | None = None,
        azure_subscription_id: str | None = None,
    ) -> None:
        self.k8s_environment = k8s_environment
        self.azure_disk_encryption_set = azure_disk_encryption_set
        self.azure_resource_group = azure_resource_group
        self.azure_subscription_id = azure_subscription_id


class StorageClasses(ComponentResource):
    def __init__(
        self, name: str, args: StorageClassesArgs, opts: ResourceOptions | None = None
    ) -> None:
        super().__init__("fridge:StorageClasses", name, None, opts)
        child_opts = ResourceOptions.merge(opts, ResourceOptions(parent=self))

        k8s_environment = args.k8s_environment

        match k8s_environment:
            case K8sEnvironment.AKS:
                storage_class = StorageClass(
                    "fridge_storage_class",
                    allow_volume_expansion=True,
                    metadata=ObjectMetaArgs(
                        name=STORAGE_CLASS_NAME,
                    ),
                    parameters={
                        "diskEncryptionSetID": f"/subscriptions/{args.azure_subscription_id}/resourceGroups/{args.azure_resource_group}/providers/Microsoft.Compute/diskEncryptionSets/{args.azure_disk_encryption_set}",
                        "kind": "managed",
                        "skuname": "StandardSSD_LRS",
                    },
                    provisioner="disk.csi.azure.com",
                    opts=child_opts,
                )
                standard_storage_name = "azurefile"
                standard_supports_rwm = True
            case K8sEnvironment.DAWN:
                longhorn_ns = Namespace(
                    "longhorn-system",
                    metadata=ObjectMetaArgs(
                        name="longhorn-system",
                        labels={} | PodSecurityStandard.PRIVILEGED.value,
                    ),
                    opts=child_opts,
                )
                longhorn = Release(
                    "longhorn",
                    namespace=longhorn_ns.metadata.name,
                    chart="longhorn",
                    version="1.9.0",
                    repository_opts=RepositoryOptsArgs(
                        repo="https://charts.longhorn.io",
                    ),
                    # Add a toleration for the GPU node, to allow Longhorn to schedule pods/create volumes there
                    values={
                        "global": {
                            "tolerations": [
                                {
                                    "key": "gpu.intel.com/i915",
                                    "operator": "Exists",
                                    "effect": "NoSchedule",
                                }
                            ]
                        },
                        "defaultSettings": {
                            "taintToleration": "gpu.intel.com/i915:NoSchedule"
                        },
                        "persistence": {"defaultClassReplicaCount": 2},
                    },
                    opts=ResourceOptions.merge(
                        child_opts,
                        ResourceOptions(depends_on=[longhorn_ns]),
                    ),
                )
                storage_class = StorageClass(
                    "fridge_storage_class",
                    allow_volume_expansion=True,
                    metadata=ObjectMetaArgs(
                        name=STORAGE_CLASS_NAME,
                    ),
                    parameters={
                        "dataLocality": "best-effort",
                        "fsType": "ext4",
                        "numberOfReplicas": "2",
                        "staleReplicaTimeout": "2880",
                    },
                    provisioner="driver.longhorn.io",
                    opts=ResourceOptions.merge(
                        child_opts,
                        ResourceOptions(depends_on=[longhorn]),
                    ),
                )
                standard_storage_name = storage_class.metadata.name
                standard_supports_rwm = True
            case K8sEnvironment.K3S:
                storage_class = StorageClass.get("fridge-storage-class", "local-path")
                standard_storage_name = storage_class.metadata.name
                standard_supports_rwm = False

        self.encrypted_storage_class = storage_class
        self.standard_storage_name = standard_storage_name
        self.standard_supports_rwm = standard_supports_rwm
```
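
To add a new target, add a corresponding case to the match statement above. A minimal sketch, assuming a hypothetical `MYCLOUD` environment whose CSI driver supports BYOK encryption (the provisioner name, parameters, and storage class names are placeholders for whatever your driver and cluster provide):

```python
# Inside the match statement in StorageClasses.__init__
case K8sEnvironment.MYCLOUD:
    storage_class = StorageClass(
        "fridge_storage_class",
        allow_volume_expansion=True,
        metadata=ObjectMetaArgs(name=STORAGE_CLASS_NAME),
        parameters={
            # CSI-specific parameters; reference your encryption key here
            # if your driver supports BYOK
        },
        provisioner="csi.mycloud.example.com",  # hypothetical provisioner
        opts=child_opts,
    )
    standard_storage_name = "mycloud-standard"  # hypothetical existing class
    standard_supports_rwm = True  # set according to that class's capabilities
```

If your CSI driver cannot encrypt volumes with a deployer-provided key, follow the Dawn case instead and deploy Longhorn.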
## Network Policies
Some K8s providers might require some tweaks to the Cilium network policies.
These are collected, similarly to the storage classes, in `infra/fridge/access-cluster/components/network_policies.py`.
For example, with AKS:
```python
import pulumi
from pulumi import ComponentResource, ResourceOptions
from pulumi_kubernetes.apiextensions import CustomResource
from pulumi_kubernetes.yaml import ConfigFile

from enums import K8sEnvironment


class NetworkPolicies(ComponentResource):
    def __init__(
        self,
        config: pulumi.config.Config,
        name: str,
        k8s_environment: K8sEnvironment,
        opts: ResourceOptions | None = None,
    ) -> None:
        super().__init__("fridge:k8s:NetworkPolicies", name, {}, opts)
        child_opts = ResourceOptions.merge(opts, ResourceOptions(parent=self))

        match k8s_environment:
            case K8sEnvironment.AKS:
                # AKS uses Konnectivity to mediate some API/webhook traffic, and uses a different external DNS server
                ConfigFile(
                    "network_policy_aks",
                    file="./k8s/cilium/aks.yaml",
                    opts=child_opts,
                )
            case K8sEnvironment.DAWN:
                # Dawn uses a different external DNS server to AKS, and also runs regular jobs that do not run on AKS
                ConfigFile(
                    "network_policy_dawn",
                    file="./k8s/cilium/dawn.yaml",
                    opts=child_opts,
                )
                # Add network policy to allow Prometheus monitoring for resources already deployed on Dawn
                # On Dawn, Prometheus is also already deployed
                ConfigFile(
                    "network_policy_prometheus",
                    file="./k8s/cilium/prometheus.yaml",
                    opts=child_opts,
                )
                # Longhorn is used on Dawn for RWX volume provision
                ConfigFile(
                    "network_policy_longhorn",
                    file="./k8s/cilium/longhorn.yaml",
                    opts=child_opts,
                )
            case K8sEnvironment.K3S:
                # K3S policies applicable for a local dev environment
                # These could be used in any vanilla k8s + Cilium local cluster
                ConfigFile(
                    "network_policy_k3s",
                    file="./k8s/cilium/k3s.yaml",
                    opts=child_opts,
                )

        self.api_jumpbox_cnp = CustomResource(
            "network_policy_api_jumpbox",
            api_version="cilium.io/v2",
            kind="CiliumNetworkPolicy",
            metadata={"name": "api-jumpbox-access", "namespace": "api-jumpbox"},
            spec={
                "endpointSelector": {"matchLabels": {"app": "api-jumpbox"}},
                "ingress": [
                    {
                        "fromEndpoints": [
                            {
                                "matchLabels": {
                                    "k8s:app.kubernetes.io/name": "ingress-nginx",
                                    "k8s:app.kubernetes.io/component": "controller",
                                    "k8s:io.kubernetes.pod.namespace": "ingress-nginx",
                                }
                            }
                        ],
                        "toPorts": [{"ports": [{"port": "2222", "protocol": "ANY"}]}],
                    }
                ],
                "egress": [
                    {
                        "toEndpoints": [
                            {
                                "matchLabels": {
                                    "k8s:io.kubernetes.pod.namespace": "kube-system",
                                    "k8s-app": "kube-dns",
                                }
                            }
                        ],
                        "toPorts": [
                            {
                                "ports": [{"port": "53", "protocol": "ANY"}],
                                "rules": {"dns": [{"matchPattern": "*"}]},
                            }
                        ],
                    },
                    {
                        "toEndpoints": [
                            {
                                "matchLabels": {
                                    "k8s:app.kubernetes.io/name": "ingress-nginx",
                                    "k8s:app.kubernetes.io/component": "controller",
                                    "k8s:io.kubernetes.pod.namespace": "ingress-nginx",
                                }
                            }
                        ],
                        "toPorts": [{"ports": [{"port": "2222", "protocol": "TCP"}]}],
                    },
                    {
                        "toCIDR": [config.require("fridge_api_ip_address")],
                        "toPorts": [{"ports": [{"port": "443", "protocol": "TCP"}]}],
                    },
                    {
                        "toFQDNs": [
                            {
                                "matchName": config.require(
                                    "isolated_cluster_api_endpoint"
                                )
                            }
                        ],
                        "toPorts": [{"ports": [{"port": "443", "protocol": "ANY"}]}],
                    },
                ],
            },
            opts=child_opts,
        )

        ConfigFile(
            "network_policy_cert_manager",
            file="./k8s/cilium/cert_manager.yaml",
            opts=child_opts,
        )
        ConfigFile(
            "network_policy_containerd_config",
            file="./k8s/cilium/containerd_config.yaml",
            opts=child_opts,
        )
        ConfigFile(
            "network_policy_harbor",
            file="./k8s/cilium/harbor.yaml",
            opts=child_opts,
        )
        ConfigFile(
            "network_policy_hubble",
            file="./k8s/cilium/hubble.yaml",
            opts=child_opts,
        )
        ConfigFile(
            "network_policy_ingress_nginx",
            file="./k8s/cilium/ingress-nginx.yaml",
            opts=child_opts,
        )
        ConfigFile(
            "network_policy_kube_node_lease",
            file="./k8s/cilium/kube-node-lease.yaml",
            opts=child_opts,
        )
        ConfigFile(
            "network_policy_kube_public",
            file="./k8s/cilium/kube-public.yaml",
            opts=child_opts,
        )
        ConfigFile(
            "network_policy_kubernetes_system",
            file="./k8s/cilium/kube-system.yaml",
            opts=child_opts,
        )
```
Here the AKS-specific policy manifests are defined in `./access-cluster/k8s/cilium/aks.yaml`.
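
A new target follows the same pattern: add a case for it and collect any environment-specific tweaks in a manifest of its own. A minimal sketch, again assuming a hypothetical `MYCLOUD` environment and a manifest path you would create yourself:

```python
# Inside the match statement in NetworkPolicies.__init__
case K8sEnvironment.MYCLOUD:
    ConfigFile(
        "network_policy_mycloud",
        file="./k8s/cilium/mycloud.yaml",  # hypothetical manifest
        opts=child_opts,
    )
```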
## Service Changes
You may also need to deploy extra services, or you may want to avoid replacing services that are already deployed.
This may be most convenient to do in `infra/fridge/access-cluster/__main__.py`.
For example, the Hubble interface for Cilium is not provisioned automatically on AKS, so it is deployed here:
```python
import pulumi
from pulumi import ResourceOptions
from pulumi_kubernetes.batch.v1 import CronJobPatch, CronJobSpecPatchArgs
from pulumi_kubernetes.core.v1 import NamespacePatch
from pulumi_kubernetes.meta.v1 import ObjectMetaPatchArgs
from pulumi_kubernetes.yaml import ConfigFile

import components
from enums import K8sEnvironment, PodSecurityStandard, TlsEnvironment


def patch_namespace(name: str, pss: PodSecurityStandard) -> NamespacePatch:
    """
    Apply a PodSecurityStandard label to a namespace
    """
    return NamespacePatch(
        f"{name}-ns-pod-security",
        metadata=ObjectMetaPatchArgs(name=name, labels={} | pss.value),
    )


config = pulumi.Config()
tls_environment = TlsEnvironment(config.require("tls_environment"))
stack_name = pulumi.get_stack()

try:
    k8s_environment = K8sEnvironment(config.get("k8s_env"))
except ValueError:
    raise ValueError(
        f"Invalid k8s environment: {config.get('k8s_env')}. "
        f"Supported values are {', '.join([item.value for item in K8sEnvironment])}."
    )

# Hubble UI
# Interface for Cilium
if k8s_environment == K8sEnvironment.AKS:
    hubble_ui = ConfigFile(
        "hubble-ui",
        file="./k8s/hubble/hubble_ui.yaml",
    )

# Private API proxy
api_ssh_jumpbox = components.FridgeAPIJumpbox(
    "fridge-api-ssh-jumpbox",
    components.FridgeAPIJumpboxArgs(
        config=config,
        k8s_environment=k8s_environment,
    ),
)

ingress_nginx = components.Ingress(
    "ingress-nginx",
    args=components.IngressArgs(
        api_jumpbox=api_ssh_jumpbox, k8s_environment=k8s_environment
    ),
)

cert_manager = components.CertManager(
    "cert-manager",
    args=components.CertManagerArgs(
        config=config,
        k8s_environment=k8s_environment,
        tls_environment=tls_environment,
    ),
)

# Storage classes
storage_classes = components.StorageClasses(
    "storage_classes",
    components.StorageClassesArgs(
        k8s_environment=k8s_environment,
        azure_disk_encryption_set=(
            config.require("azure_disk_encryption_set")
            if k8s_environment is K8sEnvironment.AKS
            else None
        ),
        azure_resource_group=(
            config.require("azure_resource_group")
            if k8s_environment is K8sEnvironment.AKS
            else None
        ),
        azure_subscription_id=(
            config.require("azure_subscription_id")
            if k8s_environment is K8sEnvironment.AKS
            else None
        ),
    ),
)

# Use patches for standard namespaces rather than trying to create them, so Pulumi does not try to delete them on teardown
standard_namespaces = ["default", "kube-node-lease", "kube-public"]
for namespace in standard_namespaces:
    patch_namespace(namespace, PodSecurityStandard.RESTRICTED)

# Harbor
harbor = components.ContainerRegistry(
    "harbor",
    components.ContainerRegistryArgs(
        config=config,
        tls_environment=tls_environment,
        storage_classes=storage_classes,
    ),
    opts=ResourceOptions(
        depends_on=[ingress_nginx, cert_manager, storage_classes],
    ),
)

# Network policy (through Cilium)
# Network policies should be deployed last to ensure that none of them interfere with the deployment process
resources = [
    cert_manager,
    harbor.configure_containerd_daemonset,
    harbor,
    ingress_nginx,
    storage_classes,
]
network_policies = components.NetworkPolicies(
    name=f"{stack_name}-network-policies",
    config=config,
    k8s_environment=k8s_environment,
    opts=ResourceOptions(
        depends_on=resources,
    ),
)

# Pulumi exports
pulumi.export("fridge_api_ip_address", config.require("fridge_api_ip_address"))
pulumi.export("harbor_fqdn", harbor.harbor_fqdn)
pulumi.export("harbor_ip_address", config.require("harbor_ip"))
pulumi.export("ingress_ip", ingress_nginx.ingress_ip)
pulumi.export("ingress_ports", ingress_nginx.ingress_ports)
```