CICD for amd64 (just for this time)
jomariya23156 committed Apr 1, 2024
1 parent ecdf7bb commit dbabe6b
Showing 7 changed files with 120 additions and 21 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/build_push_docker_hub.yaml
@@ -1,4 +1,4 @@
-name: Publish Docker images
+name: Build and Push to Docker Hub

on:
push:
@@ -20,7 +20,7 @@ jobs:
path: . # Upload the entire current directory

build_and_push_images: # Job for building and pushing the images
-name: Build and Push Image
+name: Build and Push Images
runs-on: ubuntu-latest
needs: shared_steps # Dependency on the shared steps
strategy:
@@ -60,7 +60,8 @@ jobs:
with:
context: ${{ matrix.image.context }}
file: ${{ matrix.image.file }}
-platforms: linux/arm64
+# platforms: linux/arm64
+platforms: linux/amd64
push: true
tags: ariya23156/sfmlops-${{ matrix.image.name }}:latest
labels: ${{ steps.meta.outputs.labels }}
@@ -70,4 +71,4 @@ jobs:
build-args: |
AIRFLOW_HOME=/opt/airflow
MLFLOW_ARTIFACT_ROOT=/storage/mlruns
-ARCH_TRAILING_IMG_NAME=-aarch64
+ARCH_TRAILING_IMG_NAME=""
63 changes: 63 additions & 0 deletions .github/workflows/build_push_gke.yaml
@@ -0,0 +1,63 @@
# name: Build, Push, and Deploy to GKE

# on:
# push:
# branches:
# - master

# env:
# PROJECT_ID: ${{ secrets.GKE_PROJECT }}
# GKE_CLUSTER: sfmlops-cluster # cluster name
# GKE_ZONE: asia-southeast1-a # cluster zone
# IMAGE_TAG: ${{ github.sha }} # use commit sha as an image tag
# GAR_ZONE: asia-southeast1 # artifact registry zone
# GAR_REPO: sfmlops-registry # artifact registry repository

# jobs:
# build_and_push_images:
# name: Setup, Build, Publish, and Deploy to GCP
# runs-on: ubuntu-latest
# environment: production
# strategy:
# matrix:
# image: [
# { name: web-ui, context: ./services/web-ui, file: ./services/web-ui/Dockerfile, buildargs: "" },
# { name: training-service, context: ./services/training-service, file: ./services/training-service/Dockerfile, buildargs: "" },
# { name: data-producer, context: ./services/data-producer, file: ./services/data-producer/Dockerfile, buildargs: "" },
# { name: mlflow, context: ./services/mlflow, file: ./services/mlflow/Dockerfile, buildargs: "" },
# { name: airflow-spark, context: ./services/airflow, file: ./services/airflow/Dockerfile, buildargs: "--build-arg AIRFLOW_HOME=/opt/airflow" },
# { name: ray, context: ./services/ray, file: ./services/ray/Dockerfile, buildargs: "--build-arg MLFLOW_ARTIFACT_ROOT=/storage/mlruns --build-arg ARCH_TRAILING_IMG_NAME=-aarch64" },
# { name: forecast-service, context: ./services/forecast-service, file: ./services/forecast-service/Dockerfile, buildargs: "" }
# ]
# steps:
# - name: Checkout
# uses: actions/checkout@v3

# # setup gcloud cli
# - name: Authenticate
# id: auth
# uses: google-github-actions/auth@v2
# with:
# credentials_json: ${{ secrets.GKE_SA_KEY }}

# # config docker to use gcloud cli tool as a credential
# # helper for authentication
# - name: Docker config
# run: |-
# gcloud auth print-access-token | docker login -u oauth2accesstoken --password-stdin https://$GAR_ZONE-docker.pkg.dev

# # Get GKE credentials
# - name: Setup GKE credentials
# uses: google-github-actions/get-gke-credentials@v2
# with:
# cluster_name: ${{ env.GKE_CLUSTER }}
# location: ${{ env.GKE_ZONE }}

# - name: Build and push ${{ matrix.image.name }} Docker image
# run: |-
# docker build ${{ matrix.image.buildargs }} \
# --tag "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:${{ github.sha }}" \
# --tag "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:latest" \
# -f ${{ matrix.image.file }} ${{ matrix.image.context }}
# docker push "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:${{ github.sha }}"
# docker push "$GAR_ZONE-docker.pkg.dev/$PROJECT_ID/$GAR_REPO/${{ matrix.image.name }}:latest"
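
The whole workflow above ships commented out. If you enable it later, the two repository secrets it references can be registered with the GitHub CLI; a sketch (the project ID and key path are placeholders):

```bash
# Hypothetical setup for the secrets referenced by the workflow above
gh secret set GKE_PROJECT --body "your-gcp-project-id"
gh secret set GKE_SA_KEY < path/to/service-account-key.json
```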
40 changes: 30 additions & 10 deletions README.md
@@ -36,32 +36,52 @@ Note: Most of the ports can be customized in the `.env` file at the root of this
- Platform: [Docker](https://www.docker.com/), [Kubernetes](https://kubernetes.io/), [Helm](https://helm.sh/)

# How to use
Prerequisites: Docker and Kubernetes (in our case, Docker Desktop, as pinned in the *Development environment* section)

## With Docker Compose
1. [Optional] In case you wanna build the images yourself (instead of pulling them): `docker-compose build`
2. `docker-compose -f docker-compose.yml -f docker-compose-airflow.yml up -d`
3. That's it!
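
When you're done, the same pair of compose files tears everything down; a sketch (add `-v` only if you also want to delete the named volumes):

```bash
docker-compose -f docker-compose.yml -f docker-compose-airflow.yml down
```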

-## With Kubernetes/Helm
-*Note:* The system is quite large and heavy... I recommend running local just for testing one go, then if it works, just go to cloud if you wanna play around longer OR stick with Docker Compose (it went smoother in my case)
-1. `cd sfmlops-helm` and `helm dependency build` to fetch all dependencies
-2. Both install and upgrade the main chart: `helm upgrade --install --create-namespace -n mlops sfmlops-helm ./ -f values.yaml -f values-ray.yaml`
-3. Deploy Kafka:
+**Note:** Most of the services' `restart` policies are left unspecified, so they won't restart on failure (restarts can be quite resource-consuming during development, and we have a poor laptop lol).
+
+## With Kubernetes/Helm (Local cluster)
+*Note:* The system is quite large and heavy... we recommend running it locally just once for testing; if it works and you wanna play around longer, go off to the cloud, OR stick with Docker Compose (it went more smoothly in our case)
+1. Install Helm: `bash install-helm.sh`
+2. `cd sfmlops-helm` and `helm dependency build` to fetch all dependencies
+3. Install (or upgrade) the main chart: `helm upgrade --install --create-namespace -n mlops sfmlops-helm ./ -f values.yaml -f values-ray.yaml`
+4. Deploy Kafka:
1. [Only 1st time] `helm repo add bitnami https://charts.bitnami.com/bitnami`
2. `helm -n kafka upgrade --install kafka-release oci://registry-1.docker.io/bitnamicharts/kafka --create-namespace --version 23.0.7 -f values-kafka.yaml`
-4. Deploy Airflow:
+5. Deploy Airflow:
1. [Only 1st time] `helm repo add apache-airflow https://airflow.apache.org`
2. `helm -n airflow upgrade --install airflow apache-airflow/airflow --create-namespace --version 1.13.1 -f values-airflow.yaml`
-5. Forward Airflow UI port, so we can access: `kubectl port-forward svc/airflow-webserver 8080:8080 --namespace airflow`
+6. Forward the Airflow UI port so we can access it: `kubectl port-forward svc/airflow-webserver 8080:8080 --namespace airflow`
+7. Deploy Prometheus and Grafana:
+1. [Only 1st time] `helm repo add prometheus-community https://prometheus-community.github.io/helm-charts`
+2. `helm -n monitoring upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack --create-namespace --version 57.2.0 -f values-kube-prometheus.yaml`
+3. One of the good things about kube-prometheus-stack is that it comes with many pre-installed/pre-configured dashboards for Kubernetes. Feel free to explore!
+8. That's it! Enjoy your highly scalable machine learning system for sales forecasting! :)
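
Since the bundled Grafana keeps its default service type (see `values-kube-prometheus.yaml` below), a port-forward is the easiest way in; a sketch, assuming the chart's default `<release>-grafana` service name (verify with `kubectl get svc -n monitoring`):

```bash
kubectl port-forward svc/kube-prometheus-stack-grafana 3000:80 -n monitoring
# then open http://localhost:3000 and log in with admin / admin
# (password set via adminPassword in values-kube-prometheus.yaml)
```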

**Note:** If you want to change namespace `kafka` and/or release name `kafka-release` of Kafka, please also change them in `values.yaml` and `KAFKA_BOOTSTRAP_SERVER` env var in `values-airflow.yaml`. They are also used in templating.

**Note 2:** In Docker Compose, Ray has already been configured to pull the embedded dashboards from Grafana, but in Kubernetes this process involves a lot more manual steps, so we intentionally left it undone for ease of setup of this project. You can follow the guide [here](https://docs.ray.io/en/latest/cluster/kubernetes/k8s-ecosystem/prometheus-grafana.html) if you want to do it anyway.
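
For the curious, the manual steps in that guide largely amount to pointing the Ray head at your Prometheus and Grafana endpoints through environment variables; a sketch (the variable names come from the Ray docs, while the in-cluster addresses below are our assumptions, not tested here):

```yaml
# Env vars on the Ray head pod (addresses are assumptions for this cluster)
RAY_GRAFANA_HOST: "http://kube-prometheus-stack-grafana.monitoring.svc:80"
RAY_PROMETHEUS_HOST: "http://kube-prometheus-stack-prometheus.monitoring.svc:9090"
```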

+## With Kubernetes/Helm (on GCP)
+Prerequisites: GKE Cluster (Standard cluster, *NOT* Autopilot), Artifact Registry, Service Usage API
+1. Follow this Medium blog. We recommend creating a new Service Account with the Owner role for a quick-and-dirty run (but of course, please consult your cloud engineer if you have security concerns).
+2. Download your Service Account's JSON key
+3. Activate your service account: `gcloud auth activate-service-account --key-file=<path to the JSON key>`
+4. Connect your local kubectl to the cloud cluster: `gcloud container clusters get-credentials <GKE_CLUSTER_NAME> --zone <GKE_ZONE> --project <PROJECT_NAME>`
+5. Create the `airflow` namespace beforehand with `kubectl create namespace airflow`, because our main chart contains one manifest pointing to the `airflow` namespace instead of `mlops` like the rest. Locally this works fine, but on GCP Helm doesn't seem to auto-create the `airflow` namespace while installing into `mlops`.
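
If you also want to push images to Artifact Registry from your machine (what the commented-out GKE workflow does in CI), register gcloud as a Docker credential helper once; a sketch using the registry zone from that workflow:

```bash
gcloud auth configure-docker asia-southeast1-docker.pkg.dev
```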

+## Cleanup steps
+1. `helm uninstall sfmlops-helm -n mlops`
+2. `helm uninstall kafka-release -n kafka`
+3. `helm uninstall airflow -n airflow`
+4. `helm uninstall kube-prometheus-stack -n monitoring`
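
Optionally, delete the namespaces too; a sketch (note this removes anything else still living in them):

```bash
kubectl delete namespace mlops kafka airflow monitoring
```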

### Note on Kafka Docker Compose and Helm
-Kafka services on Docker Compose and Halm are different in settings, mainly in Docker Compose, we use KRaft for config management (which is newer), but in Helm, we use ZooKeeper because, honestly, I'm not managed to pull it off with KRaft, sorry :'( (It's quite complex).
+Kafka services on Docker Compose and Helm differ in their settings: mainly, in Docker Compose we use KRaft for config management (which is newer), but in Helm we use ZooKeeper because, honestly, we didn't manage to pull it off with KRaft, sorry :'( (It's quite complex).

### Note on Stream processing options
There are a few options for consuming the stream data from the Kafka producer and saving it to Postgres
@@ -107,7 +127,7 @@ In fact, you can submit the training jobs directly from **ANY** service in the s
- Offer more flexibility and customizability: With our approach, you can add as many extra steps as you like to handle and process incoming Ray job submissions from clients. For example, you can include an additional authentication step for security purposes.


-### Using Ray with external Redis
+### Using Ray with external Redis (in Docker Compose)
If we restart the Ray container, all previous job history is gone because Ray stores it in-memory only. We can add an external Redis to manage this state but, from our experience, that seems very unstable; the official docs even state that external Redis is supported only on cloud / Kubernetes. We wanted to try anyway, and from time to time during development we found that the Ray cluster would not accept job submissions, showing the error `Job supervisor actor could not be scheduled: The actor is not schedulable: The node specified via NodeAffinitySchedulingStrategy doesn't exist any more or is infeasible, and soft=False was specified.`. We could fix that by removing all data in Redis with `docker-compose exec redis redis-cli FLUSHALL` AND/OR removing the Ray container and rebuilding it. But that's annoying and time-consuming, so in the end we got rid of external Redis for Ray. Bye~
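
For reference, the external-Redis wiring we removed boils down to one environment variable on the head node; a sketch at the compose level (the `ray-head` service name is illustrative, while `redis` matches the service used in the command above):

```yaml
# Sketch of the setting we removed: point the Ray head at external Redis
ray-head:
  environment:
    - RAY_REDIS_ADDRESS=redis:6379  # enables GCS fault tolerance via Redis
```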

## References
12 changes: 6 additions & 6 deletions docker-compose-airflow.yml
@@ -101,7 +101,7 @@ services:
timeout: 30s
retries: 50
start_period: 30s
-restart: always
+# restart: always
networks:
- forecast-network

@@ -116,7 +116,7 @@ services:
timeout: 10s
retries: 5
start_period: 30s
-restart: always
+# restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
@@ -131,7 +131,7 @@ services:
timeout: 10s
retries: 5
start_period: 30s
-restart: always
+# restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
@@ -154,7 +154,7 @@ services:
# Required to handle warm shutdown of the celery workers properly
# See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
DUMB_INIT_SETSID: "0"
-restart: always
+# restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
@@ -169,7 +169,7 @@ services:
timeout: 10s
retries: 5
start_period: 30s
-restart: always
+# restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
@@ -268,7 +268,7 @@ services:
timeout: 10s
retries: 5
start_period: 30s
-restart: always
+# restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
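The six `restart: always` lines above are commented out by this commit to keep failing services from eating resources during development (see the README note). If you want some resilience back later, a lighter policy is a middle ground; a sketch for one service:

```yaml
services:
  airflow-webserver:
    restart: on-failure  # restart only on non-zero exit, not on manual stop
```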
5 changes: 5 additions & 0 deletions install-helm.sh
@@ -0,0 +1,5 @@
#!/bin/bash

curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 && \
chmod 700 get_helm.sh && \
./get_helm.sh
2 changes: 1 addition & 1 deletion sfmlops-helm/Chart.yaml
@@ -25,7 +25,7 @@ appVersion: "1.16.0"

dependencies:
- name: ingress-nginx
-version: 4.10.0 # Specify the version you want
+version: 4.10.0
repository: https://kubernetes.github.io/ingress-nginx
- name: kuberay-operator
version: 1.1.0-rc.0
10 changes: 10 additions & 0 deletions sfmlops-helm/values-kube-prometheus.yaml
@@ -0,0 +1,10 @@
prometheus:
service:
type: LoadBalancer

## Grafana's default service port is 80, which can conflict with our main nginx
## when running locally, so we just leave the service type as the default
grafana:
adminPassword: admin
# service:
# type: LoadBalancer
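
Since `prometheus.service.type` is set to `LoadBalancer` here, you can find Prometheus's external address straight from the service; a quick sketch (service name assumed from the chart's `<release>-prometheus` naming):

```bash
kubectl get svc kube-prometheus-stack-prometheus -n monitoring
# look at the EXTERNAL-IP column; Prometheus listens on port 9090
```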
