Pre-upgrade-script

To improve the experience during your upcoming upgrades, we recommend running our pre-check script before initiating the process. This proactive step is designed to identify and address potential blockers that may arise during the upgrade.

The bash script has the following prerequisites:

  • kubectl binaries and access
  • jq binaries
#!/bin/bash

#
# This Pre-upgrade check script can be used by Swisscom CaaS customers
# before upgrading Kubernetes clusters on CaaS to see if the cluster
# has some common conditions which can lead to issues during the upgrade.
#

# Verbosity switch: change the value to "yes" to print additional debug messages
debug="no"

# Print a warning to stdout: an empty line, then the message prefixed
# with "Warning! ".
# Arguments: all arguments are joined with spaces into a single message.
warn() {
  printf '\nWarning! %s\n' "$*"
}
# Print a fatal error message to stdout and terminate the script.
# Arguments: all arguments are joined with spaces into a single message.
# Exits:     always, with status 1.
die() {
  printf 'Error! %s\nExiting.\n' "$*"
  exit 1
}
# Print a debug message to stdout, but only when the global $debug is "yes".
# Arguments: all arguments are joined with spaces into a single message.
# Returns:   0 in both cases. An if-statement is used instead of
#            `[[ ... ]] && echo` so that a disabled debug flag does not make
#            the function report failure to its caller.
debug() {
  if [[ "$debug" == "yes" ]]; then
    echo "Debug: $*"
  fi
}

# Introductory banner. The quoted here-doc delimiter keeps the text literal
# (no variable or command expansion).
cat <<'BANNER'
This script will check the k8s cluster before upgrade for some common conditions
which often require operator attention before proceeding with the upgrade.

If you see warnings below, it is likely that an issue is possible during the upgrade.
Please take your time to understand exactly what is happening, and take appropriate action.

This script checks for some of the most common conditions experiences by Swisscom customers.
It is not an exhaustive list. The operator should excercise their judgement before
proceeding with the upgrade.

BANNER

# Check pre-requisites
# `command -v` is a shell builtin that succeeds only when the name resolves to
# something runnable. It replaces the external `which`, which may be missing
# or print its diagnostics on stdout on some systems.
command -v kubectl >/dev/null 2>&1 || die "This script requires kubectl to query your cluster."
command -v jq >/dev/null 2>&1 || die "This script requires jq. Please install before proceeding: https://jqlang.github.io/jq/download/"

# Check kube-api connection and list namespaces
# pipefail makes a failing kubectl visible even when its output is piped on.
set -o pipefail
if ! context=$(kubectl config current-context) || [[ -z "$context" ]]; then
  die "The script is unable to find a kubectl context to connect to."
fi
echo "==== Connecting to k8s cluster context $context..."

# Get the server GitVersion (e.g. v1.24.10) from the JSON form of
# `kubectl version`. The former `--short` flag is not accepted by current
# kubectl releases, which left the version variables empty.
server_version=$(kubectl version -o json | jq -r '.serverVersion.gitVersion // empty')
# Split "v<major>.<minor>.<rest>" into its first two numeric components
major_version=${server_version#v}
major_version=${major_version%%.*}
minor_version=${server_version#*.}
minor_version=${minor_version%%.*}
if [[ -n "$server_version" ]]; then
  echo "Detected Kubernetes cluster version: $server_version"
else
  warn "Unable to detect the Kubernetes cluster version."
fi

# Get namespaces (newline-separated list of names)
if ! namespaces=$(kubectl get ns -o json | jq -r ".items[].metadata.name") || [[ -z "$namespaces" ]]; then
  die "The script is unable to find any k8s namespaces."
fi
# $namespaces is deliberately unquoted so the names are printed on one line
# shellcheck disable=SC2086
echo "Retreived a list of cluster namespaces:" $namespaces

# Check for PDB config requiring manual intervention during upgrades
echo ""
echo "==== Checking for Pod Disruption Budget config requiring manual intervention during upgrades..."

# $namespaces is deliberately unquoted: it is a whitespace-separated list
for namespace in $namespaces ; do
  debug "Namespace: $namespace"
  # A single query per namespace yields "<name><TAB><disruptionsAllowed>" for
  # every PDB, instead of one extra kubectl call per PDB.
  if ! pdb_rows=$(kubectl get poddisruptionbudgets -n "$namespace" -o json \
    | jq -r '.items[] | "\(.metadata.name)\t\(.status.disruptionsAllowed)"'); then
    # A failed query is not the same as "no PDBs": report it explicitly
    warn "Unable to list PDBs in namespace '$namespace'. Please check this namespace manually."
    continue
  fi
  if [[ -z "$pdb_rows" ]]; then
    echo "No PDBs found in namespace '$namespace'"
    continue
  fi
  debug "PDBs (name, disruptionsAllowed): $pdb_rows"

  while IFS=$'\t' read -r pdb disruptions_allowed; do
    [[ -n "$pdb" ]] || continue
    echo -n "Checking PDB '$pdb' in namespace '$namespace': "
    # A PDB without a reported status yields "null" here and is treated as OK
    if [[ "$disruptions_allowed" == "0" ]] ; then
      warn  "Pod disruption budget '$pdb' has disruptionsAllowed at zero."
      echo   "   This means that it is not possible to upgrade the cluster without downtime."
      echo   "   Automatic draining of the node during the upgrade will not be possible."
      echo   "   Consult the documentation at https://kubernetes.io/docs/tasks/run-application/configure-pdb/ for details"
      echo   ""
    else
      echo "OK"
    fi
  done <<< "$pdb_rows"
done

# Check for volumes pending deletion
echo ""
echo "==== Checking for for volumes pending deletion..."

# $namespaces is deliberately unquoted: it is a whitespace-separated list
for namespace in $namespaces ; do
  debug "Namespace: $namespace"

  # A single query per namespace yields "<name><TAB><state>" for every PVC.
  # The state is taken from the JSON object rather than scraped from the
  # human-readable table: a PVC is reported as Terminating once its
  # metadata.deletionTimestamp is set, otherwise its status.phase is used.
  if ! pvc_rows=$(kubectl get pvc -n "$namespace" -o json \
    | jq -r '.items[]
        | [.metadata.name,
           (if .metadata.deletionTimestamp then "Terminating"
            else (.status.phase // "Unknown") end)]
        | @tsv'); then
    # A failed query is not the same as "no PVCs": report it explicitly
    warn "Unable to list PVCs in namespace '$namespace'. Please check this namespace manually."
    continue
  fi
  if [[ -z "$pvc_rows" ]]; then
    echo "No PVCs found in namespace '$namespace'"
    continue
  fi
  debug "PVCs (name, state): $pvc_rows"

  while IFS=$'\t' read -r pvc pvc_state; do
    [[ -n "$pvc" ]] || continue
    echo -n "Checking PVC '$pvc' in namespace '$namespace': "
    if [[ "$pvc_state" == "Terminating" ]] ; then
      warn "PersistentVolumeClaim '$pvc' is in Terminating state."
      echo "   This means that the associated volume will likely get deleted"
      echo "   during the upgrade once the node is drained."
      echo "   Consult the documentation for details:"
      echo "   https://kubernetes.io/docs/concepts/storage/persistent-volumes/#storage-object-in-use-protection"
      echo ""
    else
      echo "OK"
    fi
  done <<< "$pvc_rows"
done

# Check for PodSecurityPolicy
# Each value below is a '|'-separated alternation, ready to be embedded in an
# extended regular expression.

# PSP name prefixes that are not treated as user-defined
system_psp_names="kube-system-psp|pks-privileged|pks-restricted"
system_psp_names+="|a-vrops-psp|a-wavefront-psp"
system_psp_names+="|cert-generator|event-controller|fluent-bit"
system_psp_names+="|metric-controller|node-exporter|observability-manager"
system_psp_names+="|sink-controller|telegraf|validator|vsphere-csi-webhook"

# Pre-created PSPs whose bindings are checked and reported
system_psp_names_for_users="pks-privileged"
system_psp_names_for_users+="|pks-restricted"

# System namespaces to ignore
system_namespaces="kube-node-lease|kube-public|kube-system"
system_namespaces+="|nsx-system|pks-system|pks-system-host-monitoring"
system_namespaces+="|vmware-system-csi"

# Function to check PSP usage in RoleBindings
#
# Lists the RoleBindings of all namespaces and prints every row that contains
# the PSP name as a literal substring, unless the row belongs to one of the
# namespaces listed in $system_namespaces.
# Globals:   system_namespaces (read)  - '|'-separated namespace names
#            psp_binding_found (written) - "true" if a row was reported
# Arguments: $1 - PSP name to search for
# Outputs:   one "Found RoleBinding ..." line per reported row, on stdout
# Returns:   0 if at least one row was reported, 1 otherwise
check_psp_usage_rolebinding() {
  local psp="$1"
  local namespace name role
  # Not declared local: the original implementation set this as a global
  psp_binding_found=false
  # Row layout read here: NAMESPACE NAME ROLE [further columns ignored]
  while read -r namespace name role _; do
    # Skip system namespaces. The namespace must equal one of the listed
    # names exactly; matching the list against the namespace (the reverse)
    # would also skip any namespace that is merely a substring of the list,
    # e.g. "monitoring".
    if [[ ! $namespace =~ ^($system_namespaces)$ ]]; then
      echo "     Found RoleBinding for PSP '$psp': Namespace: $namespace, Name: $name, Role/SA: $role"
      psp_binding_found=true
    fi
  # -F: the PSP name is a literal string, not a regular expression
  done < <(kubectl get rolebinding --all-namespaces --no-headers | grep -F -- "$psp")

  [[ $psp_binding_found == true ]]
}

# Function to check PSP usage in ClusterRoleBindings
#
# Lists the ClusterRoleBindings and prints every row that contains the PSP
# name as a literal substring.
# Globals:   psp_binding_found (written) - "true" if a row was reported
# Arguments: $1 - PSP name to search for
# Outputs:   one "Found ClusterRoleBinding ..." line per matching row, on stdout
# Returns:   0 if at least one row matched, 1 otherwise
check_psp_usage_clusterrolebinding() {
  local psp="$1"
  local name role
  # Not declared local: the original implementation set this as a global
  psp_binding_found=false
  # ClusterRoleBindings are not namespaced; the row layout read here is
  # NAME ROLE [further columns ignored], so the binding name is reported
  # instead of a namespace and the role is taken from the second column.
  while read -r name role _; do
    echo "     Found ClusterRoleBinding for PSP '$psp': Name: $name, Role/SA: $role"
    psp_binding_found=true
  # -F: the PSP name is a literal string, not a regular expression
  done < <(kubectl get clusterrolebinding --no-headers | grep -F -- "$psp")

  [[ $psp_binding_found == true ]]
}

echo ""
echo "==== Checking PodSecurityPolicy..."

# Collect the PSP names (first column of the table), one per line.
# stderr is discarded rather than merged into the output: merging it turned
# kubectl's "No resources found" notice into a phantom PSP called "No".
# Discarding it also drops kubectl warnings, which previously had to be
# filtered out with grep.
psp_output="$(kubectl get podsecuritypolicies --no-headers 2>/dev/null | awk '{print $1}')"
exit_status=$?

if [ $exit_status -ne 0 ] || [ -z "$psp_output" ]; then
  if [[ ! "$major_version" =~ ^[0-9]+$ || ! "$minor_version" =~ ^[0-9]+$ ]]; then
    # Without a numeric version it is impossible to tell whether the absence
    # of PSPs is expected, and the integer comparison below would error out.
    warn "No PodSecurityPolicies detected in this cluster."
    echo "   The Kubernetes cluster version could not be determined, so it is unclear whether this is expected."
    echo "   Please check your cluster manually."
  elif (( 10#$major_version == 1 && 10#$minor_version <= 24 )); then
    warn "No PodSecurityPolicies detected in this cluster."
    echo "   This is unusual, as a TKGi 1.15 / K8s 1.24 or earlier cluster normally should have at least system PSPs defined."
    echo "   Please check your cluster manually."
  else
    echo "No PodSecurityPolicies detected in this cluster."
    echo "   This is normal, as TKGi 1.16 / K8s 1.25 or later clusters do not support PSPs."
  fi
  # Nothing further to check without PSPs
  exit 0
fi

# Split the newline-separated PSP names into the psp_names array
IFS=$'\n' read -r -d '' -a psp_names <<< "$psp_output"

# Outcome flags, evaluated after the loop
user_psps_found=false
pre_created_psp_use_found=false

# Sort every PSP into one of three groups by name prefix:
#   1. not starting with any name in $system_psp_names -> user-defined
#   2. starting with a name in $system_psp_names_for_users -> pre-created;
#      flagged only if a binding for it is found
#   3. any other system PSP -> mentioned in debug output only
for psp_name in "${psp_names[@]}"; do
  if [[ ! "$psp_name" =~ ^($system_psp_names) ]]; then
    echo "   Found user-defined PodSecurityPolicy '$psp_name'"
    # Bindings are listed for information; the return status is not used
    check_psp_usage_rolebinding "$psp_name"
    check_psp_usage_clusterrolebinding "$psp_name"
    user_psps_found=true
  elif [[ "$psp_name" =~ ^($system_psp_names_for_users) ]]; then
    echo "   Found TKGi pre-created PodSecurityPolicy '$psp_name'"
    if check_psp_usage_rolebinding "$psp_name"; then
      pre_created_psp_use_found=true
    fi
    if check_psp_usage_clusterrolebinding "$psp_name"; then
      pre_created_psp_use_found=true
    fi
  else
    debug "Found TKGi pre-created PodSecurityPolicy '$psp_name'. This is a system policy and will be safely ignored."
  fi
done

# Summary 1: bindings to the TKGi pre-created policies
if [[ "$pre_created_psp_use_found" != true ]]; then
  echo "   This cluster has not been found to be using TKGi pre-created PodSecurityPolicies pks-privileged and pks-restricted"
else
  warn "This cluster is found to be using TKGi pre-created PodSecurityPolicies pks-privileged and/or pks-restricted"
  echo "   Once upgraded to TKGi 1.16 / K8s 1.25, this binding will have no effect, which may result in disruption of the workloads."
fi

# Summary 2: policies created by the cluster operator
if [[ "$user_psps_found" != true ]]; then
  echo "   This cluster has not been found to contain PodSecurityPolicies created by the cluster operator."
else
  warn "This cluster contains PodSecurityPolicies created by the cluster operator."
  echo "   VMware does not support upgrade of clusters to TKGi 1.16 / K8s 1.25 when user-defined PSPs are still present in the cluster."
fi

# Migration hint, printed when either of the two findings applies
if [[ "$pre_created_psp_use_found" == true || "$user_psps_found" == true ]]; then
  echo "   PodSecurityPolicy in this cluster will need to be migrated before proceeding with upgrade to TKGi 1.16."
  echo "   Consult the documentation at https://kubernetes.io/docs/tasks/configure-pod-container/migrate-from-psp/ for details"
fi
Last Updated: