CICD for amd64 (just for this time)
jomariya23156 committed Apr 1, 2024
1 parent ecdf7bb commit dbabe6b
Showing 7 changed files with 120 additions and 21 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/build_push_docker_hub.yaml
@@ -1,4 +1,4 @@
-name: Publish Docker images
+name: Build and Push to Docker Hub

on:
push:
@@ -20,7 +20,7 @@ jobs:
path: . # Upload the entire current directory

build_and_push_images: # Job for building and pushing the images
-name: Build and Push Image
+name: Build and Push Images
runs-on: ubuntu-latest
needs: shared_steps # Dependency on the shared steps
strategy:
@@ -60,7 +60,8 @@ jobs:
with:
context: ${{ matrix.image.context }}
file: ${{ matrix.image.file }}
-platforms: linux/arm64
+# platforms: linux/arm64
+platforms: linux/amd64
push: true
tags: ariya23156/sfmlops-${{ matrix.image.name }}:latest
labels: ${{ steps.meta.outputs.labels }}
@@ -70,4 +71,4 @@ jobs:
build-args: |
AIRFLOW_HOME=/opt/airflow
MLFLOW_ARTIFACT_ROOT=/storage/mlruns
-ARCH_TRAILING_IMG_NAME=-aarch64
+ARCH_TRAILING_IMG_NAME=""
63 changes: 63 additions & 0 deletions .github/workflows/build_push_gke.yaml
@@ -0,0 +1,63 @@
# name: Build, Push, and Deploy to GKE

# on:
# push:
# branches:
# - master

# env:
# PROJECT_ID: ${{ secrets.GKE_PROJECT }}
# GKE_CLUSTER: sfmlops-cluster # cluster name
# GKE_ZONE: asia-southeast1-a # cluster zone
# IMAGE_TAG: ${{ github.sha }} # use commit sha as an image tag
# GAR_ZONE: asia-southeast1 # artifact registry zone
# GAR_REPO: sfmlops-registry # artifact registry repository

# jobs:
# build_and_push_images:
# name: Setup, Build, Publish, and Deploy to GCP
# runs-on: ubuntu-latest
# environment: production
# strategy:
# matrix:
# image: [
# { name: web-ui, context: ./services/web-ui, file: ./services/web-ui/Dockerfile, buildargs: "" },
# { name: training-service, context: ./services/training-service, file: ./services/training-service/Dockerfile, buildargs: "" },
# { name: data-producer, context: ./services/data-producer, file: ./services/data-producer/Dockerfile, buildargs: "" },
# { name: mlflow, context: ./services/mlflow, file: ./services/mlflow/Dockerfile, buildargs: "" },
# { name: airflow-spark, context: ./services/airflow, file: ./services/airflow/Dockerfile, buildargs: "--build-arg AIRFLOW_HOME=/opt/airflow" },
# { name: ray, context: ./services/ray, file: ./services/ray/Dockerfile, buildargs: "--build-arg MLFLOW_ARTIFACT_ROOT=/storage/mlruns --build-arg ARCH_TRAILING_IMG_NAME=-aarch64" },
# { name: forecast-service, context: ./services/forecast-service, file: ./services/forecast-service/Dockerfile, buildargs: "" }
# ]
# steps:
# - name: Checkout
# uses: actions/checkout@v3

# # setup gcloud cli
# - name: Authenticate
# id: auth
# uses: google-github-actions/auth@v2
# with:
# credentials_json: ${{ secrets.GKE_SA_KEY }}

# # config docker to use gcloud cli tool as a credential
# # helper for authentication
# - name: Docker config
# run: |-
# gcloud auth print-access-token | docker login -u oauth2accesstoken --password-stdin https://$GAR_ZONE-docker.pkg.dev

# # Get GKE credentials
# - name: Setup GKE credentials
# uses: google-github-actions/get-gke-credentials@v2
# with:
# cluster_name: ${{ env.GKE_CLUSTER }}
# location: ${{ env.GKE_ZONE }}

# - name: Build and push ${{ matrix.image.name }} Docker image
# run: |-
# docker build ${{ matrix.image.buildargs }} \
# --tag "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:${{ github.sha }}" \
# --tag "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:latest" \
# -f ${{ matrix.image.file }} ${{ matrix.image.context }}
# docker push "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:${{ github.sha }}"
# docker push "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:latest"
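
The whole workflow above ships commented out. If you enable it later, the two repository secrets it references can be registered with the GitHub CLI; a sketch (the project ID and key path are placeholders):

```bash
# Hypothetical setup for the secrets referenced by the workflow above
gh secret set GKE_PROJECT --body "your-gcp-project-id"
gh secret set GKE_SA_KEY < path/to/service-account-key.json
```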
40 changes: 30 additions & 10 deletions README.md
@@ -36,32 +36,52 @@ Note: Most of the ports can be customized in the `.env` file at the root of this
- Platform: [Docker](https://www.docker.com/), [Kubernetes](https://kubernetes.io/), [Helm](https://helm.sh/)

# How to use
Prerequisites: Docker and Kubernetes (in our case, Docker Desktop, as pinned in the *Development environment* section)

## With Docker Compose
1. [Optional] In case you wanna build the images yourself (instead of pulling them): `docker-compose build`
2. `docker-compose -f docker-compose.yml -f docker-compose-airflow.yml up -d`
3. That's it!
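
When you're done, the same pair of compose files tears everything down; a sketch (add `-v` only if you also want to delete the named volumes):

```bash
docker-compose -f docker-compose.yml -f docker-compose-airflow.yml down
```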

-## With Kubernetes/Helm
-*Note:* The system is quite large and heavy... I recommend running local just for testing one go, then if it works, just go to cloud if you wanna play around longer OR stick with Docker Compose (it went smoother in my case)
-1. `cd sfmlops-helm` and `helm dependency build` to fetch all dependencies
-2. Both install and upgrade the main chart: `helm upgrade --install --create-namespace -n mlops sfmlops-helm ./ -f values.yaml -f values-ray.yaml`
-3. Deploy Kafka:
+**Note:** Most of the services' `restart` policies are left unspecified, so they won't restart on failure (restarts can be quite resource-consuming during development, and we have a poor laptop lol).
+
+## With Kubernetes/Helm (Local cluster)
+*Note:* The system is quite large and heavy... we recommend running it locally just once for testing; if it works and you wanna play around longer, go off to the cloud, OR stick with Docker Compose (it went more smoothly in our case)
+1. Install Helm: `bash install-helm.sh`
+2. `cd sfmlops-helm` and `helm dependency build` to fetch all dependencies
+3. Install (or upgrade) the main chart: `helm upgrade --install --create-namespace -n mlops sfmlops-helm ./ -f values.yaml -f values-ray.yaml`
+4. Deploy Kafka:
1. [Only 1st time] `helm repo add bitnami https://charts.bitnami.com/bitnami`
2. `helm -n kafka upgrade --install kafka-release oci://registry-1.docker.io/bitnamicharts/kafka --create-namespace --version 23.0.7 -f values-kafka.yaml`
-4. Deploy Airflow:
+5. Deploy Airflow:
1. [Only 1st time] `helm repo add apache-airflow https://airflow.apache.org`
2. `helm -n airflow upgrade --install airflow apache-airflow/airflow --create-namespace --version 1.13.1 -f values-airflow.yaml`
-5. Forward Airflow UI port, so we can access: `kubectl port-forward svc/airflow-webserver 8080:8080 --namespace airflow`
+6. Forward the Airflow UI port so we can access it: `kubectl port-forward svc/airflow-webserver 8080:8080 --namespace airflow`
+7. Deploy Prometheus and Grafana:
+1. [Only 1st time] `helm repo add prometheus-community https://prometheus-community.github.io/helm-charts`
+2. `helm -n monitoring upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack --create-namespace --version 57.2.0 -f values-kube-prometheus.yaml`
+3. One of the good things about kube-prometheus-stack is that it comes with many pre-installed/pre-configured dashboards for Kubernetes. Feel free to explore!
+8. That's it! Enjoy your highly scalable machine learning system for sales forecasting! :)
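
Since the bundled Grafana keeps its default service type (see `values-kube-prometheus.yaml` below), a port-forward is the easiest way in; a sketch, assuming the chart's default `<release>-grafana` service name (verify with `kubectl get svc -n monitoring`):

```bash
kubectl port-forward svc/kube-prometheus-stack-grafana 3000:80 -n monitoring
# then open http://localhost:3000 and log in with admin / admin
# (password set via adminPassword in values-kube-prometheus.yaml)
```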

**Note:** If you want to change namespace `kafka` and/or release name `kafka-release` of Kafka, please also change them in `values.yaml` and `KAFKA_BOOTSTRAP_SERVER` env var in `values-airflow.yaml`. They are also used in templating.

**Note 2:** In Docker Compose, Ray has already been configured to pull the embedded dashboards from Grafana, but in Kubernetes this process involves a lot more manual steps, so we intentionally left it undone for ease of setup of this project. You can follow the guide [here](https://docs.ray.io/en/latest/cluster/kubernetes/k8s-ecosystem/prometheus-grafana.html) if you want to do it anyway.
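
For the curious, the manual steps in that guide largely amount to pointing the Ray head at your Prometheus and Grafana endpoints through environment variables; a sketch (the variable names come from the Ray docs, while the in-cluster addresses below are our assumptions, not tested here):

```yaml
# Env vars on the Ray head pod (addresses are assumptions for this cluster)
RAY_GRAFANA_HOST: "http://kube-prometheus-stack-grafana.monitoring.svc:80"
RAY_PROMETHEUS_HOST: "http://kube-prometheus-stack-prometheus.monitoring.svc:9090"
```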

+## With Kubernetes/Helm (on GCP)
+Prerequisites: GKE Cluster (Standard cluster, *NOT* Autopilot), Artifact Registry, Service Usage API
+1. Follow this Medium blog. We recommend creating a new Service Account with the Owner role for a quick-and-dirty run (but of course, please consult your cloud engineer if you have security concerns).
+2. Download your Service Account's JSON key
+3. Activate your service account: `gcloud auth activate-service-account --key-file=<path to the JSON key>`
+4. Connect your local kubectl to the cloud cluster: `gcloud container clusters get-credentials <GKE_CLUSTER_NAME> --zone <GKE_ZONE> --project <PROJECT_NAME>`
+5. Create the `airflow` namespace beforehand with `kubectl create namespace airflow`, because our main chart contains one manifest pointing to the `airflow` namespace instead of `mlops` like the rest. Locally this works fine, but on GCP Helm doesn't seem to auto-create the `airflow` namespace while installing into `mlops`.
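
If you also want to push images to Artifact Registry from your machine (what the commented-out GKE workflow does in CI), register gcloud as a Docker credential helper once; a sketch using the registry zone from that workflow:

```bash
gcloud auth configure-docker asia-southeast1-docker.pkg.dev
```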

+## Cleanup steps
+1. `helm uninstall sfmlops-helm -n mlops`
+2. `helm uninstall kafka-release -n kafka`
+3. `helm uninstall airflow -n airflow`
+4. `helm uninstall kube-prometheus-stack -n monitoring`
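
Optionally, delete the namespaces too; a sketch (note this removes anything else still living in them):

```bash
kubectl delete namespace mlops kafka airflow monitoring
```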

### Note on Kafka Docker Compose and Helm
-Kafka services on Docker Compose and Halm are different in settings, mainly in Docker Compose, we use KRaft for config management (which is newer), but in Helm, we use ZooKeeper because, honestly, I'm not managed to pull it off with KRaft, sorry :'( (It's quite complex).
+Kafka services on Docker Compose and Helm differ in their settings: mainly, in Docker Compose we use KRaft for config management (which is newer), but in Helm we use ZooKeeper because, honestly, we didn't manage to pull it off with KRaft, sorry :'( (It's quite complex).

### Note on Stream processing options
There are a few options for consuming the stream data from the Kafka producer and saving it to Postgres
@@ -107,7 +127,7 @@ In fact, you can submit the training jobs directly from **ANY** service in the s
- Offer more flexibility and customizability: With our approach, you can add as many extra steps as you like to handle and process incoming Ray job submissions from clients. For example, you can include an additional authentication step for security purposes.


-### Using Ray with external Redis
+### Using Ray with external Redis (in Docker Compose)
If we restart the Ray container, all previous job history is gone because Ray stores it in-memory only. We can add an external Redis to manage this state but, from our experience, that seems very unstable; the official docs even state that external Redis is supported only on cloud / Kubernetes. We wanted to try anyway, and from time to time during development we found that the Ray cluster would not accept job submissions, showing the error `Job supervisor actor could not be scheduled: The actor is not schedulable: The node specified via NodeAffinitySchedulingStrategy doesn't exist any more or is infeasible, and soft=False was specified.`. We could fix that by removing all data in Redis with `docker-compose exec redis redis-cli FLUSHALL` AND/OR removing the Ray container and rebuilding it. But that's annoying and time-consuming, so in the end we got rid of external Redis for Ray. Bye~
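
For reference, the external-Redis wiring we removed boils down to one environment variable on the head node; a sketch at the compose level (the `ray-head` service name is illustrative, while `redis` matches the service used in the command above):

```yaml
# Sketch of the setting we removed: point the Ray head at external Redis
ray-head:
  environment:
    - RAY_REDIS_ADDRESS=redis:6379  # enables GCS fault tolerance via Redis
```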

## References
12 changes: 6 additions & 6 deletions docker-compose-airflow.yml
@@ -101,7 +101,7 @@ services:
timeout: 30s
retries: 50
start_period: 30s
-restart: always
+# restart: always
networks:
- forecast-network

@@ -116,7 +116,7 @@ services:
timeout: 10s
retries: 5
start_period: 30s
-restart: always
+# restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
@@ -131,7 +131,7 @@ services:
timeout: 10s
retries: 5
start_period: 30s
-restart: always
+# restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
@@ -154,7 +154,7 @@ services:
# Required to handle warm shutdown of the celery workers properly
# See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
DUMB_INIT_SETSID: "0"
-restart: always
+# restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
@@ -169,7 +169,7 @@ services:
timeout: 10s
retries: 5
start_period: 30s
-restart: always
+# restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
@@ -268,7 +268,7 @@ services:
timeout: 10s
retries: 5
start_period: 30s
-restart: always
+# restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
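The six `restart: always` lines above are commented out by this commit to keep failing services from eating resources during development (see the README note). If you want some resilience back later, a lighter policy is a middle ground; a sketch for one service:

```yaml
services:
  airflow-webserver:
    restart: on-failure  # restart only on non-zero exit, not on manual stop
```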
5 changes: 5 additions & 0 deletions install-helm.sh
@@ -0,0 +1,5 @@
#!/bin/bash

curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 && \
chmod 700 get_helm.sh && \
./get_helm.sh
2 changes: 1 addition & 1 deletion sfmlops-helm/Chart.yaml
@@ -25,7 +25,7 @@ appVersion: "1.16.0"

dependencies:
- name: ingress-nginx
-version: 4.10.0 # Specify the version you want
+version: 4.10.0
repository: https://kubernetes.github.io/ingress-nginx
- name: kuberay-operator
version: 1.1.0-rc.0
10 changes: 10 additions & 0 deletions sfmlops-helm/values-kube-prometheus.yaml
@@ -0,0 +1,10 @@
prometheus:
service:
type: LoadBalancer

## Grafana's default service port is 80, which can conflict with our main nginx
## when running locally, so we just leave the service type as the default
grafana:
adminPassword: admin
# service:
# type: LoadBalancer
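
Since `prometheus.service.type` is set to `LoadBalancer` here, you can find Prometheus's external address straight from the service; a quick sketch (service name assumed from the chart's `<release>-prometheus` naming):

```bash
kubectl get svc kube-prometheus-stack-prometheus -n monitoring
# look at the EXTERNAL-IP column; Prometheus listens on port 9090
```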
