diff --git a/.gitignore b/.gitignore
index 6ef5822c8..0635abd98 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,3 +43,10 @@
 cscope.*
 /bazel-*
 *.pyc
+
+# Helm chart dependencies cache
+**/Chart.lock
+**/charts/*.tgz
+
+# Helm chart output directory
+ai/ai-starter-kit/out
\ No newline at end of file
diff --git a/ai/ai-starter-kit/Makefile b/ai/ai-starter-kit/Makefile
new file mode 100644
index 000000000..fa9a7e24f
--- /dev/null
+++ b/ai/ai-starter-kit/Makefile
@@ -0,0 +1,62 @@
+.PHONY: check_hf_token check_OCI_target package_helm lint dep_update install install_gke start uninstall push_helm
+
+check_hf_token:
+ifndef HF_TOKEN
+	$(error HF_TOKEN is not set)
+endif
+
+check_OCI_target:
+ifndef OCI_HELM_TARGET
+	$(error OCI_HELM_TARGET is not set)
+endif
+
+package_helm:
+	helm package helm-chart/ai-starter-kit/ --destination out/
+
+push_helm: check_OCI_target
+	helm push out/ai-starter-kit* oci://$$OCI_HELM_TARGET
+
+lint:
+	helm lint helm-chart/ai-starter-kit
+
+dep_update:
+	helm dependency update helm-chart/ai-starter-kit
+
+install: check_hf_token
+	helm upgrade --install ai-starter-kit helm-chart/ai-starter-kit --set huggingface.token="$$HF_TOKEN" --timeout 10m -f helm-chart/ai-starter-kit/values.yaml
+
+start:
+	mkdir -p "${HOME}/models-cache"
+	minikube start --cpus 4 --memory 15000 --mount --mount-string="${HOME}/models-cache:/tmp/models-cache"
+
+start_gpu:
+	mkdir -p "${HOME}/models-cache"
+	minikube start --driver krunkit --cpus 4 --memory 15000 --mount --mount-string="${HOME}/models-cache:/tmp/models-cache"
+
+uninstall:
+	helm uninstall ai-starter-kit
+
+destroy:
+	minikube delete
+
+validate_jupyterhub:
+	kubectl get pods; \
+	kubectl wait --for=condition=Ready pods -l 'component!=continuous-image-puller' --timeout=1800s; \
+	kubectl get pods; \
+	kubectl get services; \
+	kubectl port-forward service/ai-starter-kit-jupyterhub-proxy-public 8081:80 & \
+	PID=$$!; \
+	echo "Port-forward PID=$${PID}"; \
+	sleep 5s; \
+	python3 ./ci/test_hub.py "127.0.0.1:8081"; \
+	kill $$PID
+
+validate_ray:
+	kubectl wait --for=condition=Ready pods -l 'app.kubernetes.io/created-by=kuberay-operator' --timeout=1800s; \
+	kubectl get pods; \
+	kubectl get services; \
+	kubectl port-forward service/ai-starter-kit-kuberay-head-svc 8265:8265 & \
+	PID=$$!; \
+	sleep 10s; \
+	ray job submit --address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"; \
+	kill $$PID
diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/.helmignore b/ai/ai-starter-kit/helm-chart/ai-starter-kit/.helmignore
new file mode 100644
index 000000000..0e8a0eb36
--- /dev/null
+++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/Chart.yaml b/ai/ai-starter-kit/helm-chart/ai-starter-kit/Chart.yaml
new file mode 100644
index 000000000..9bf77a3b5
--- /dev/null
+++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/Chart.yaml
@@ -0,0 +1,45 @@
+apiVersion: v2
+name: ai-starter-kit
+description: A Helm chart for Kubernetes
+
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
+
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.1.0
+
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# It is recommended to use it with quotes.
+appVersion: "0.1.0"
+
+
+dependencies:
+  - name: kuberay-operator
+    condition: ray-cluster.enabled
+    version: "1.3.0"
+    repository: "https://ray-project.github.io/kuberay-helm"
+  - condition: ray-cluster.enabled
+    name: ray-cluster
+    version: "1.3.0"
+    repository: "https://ray-project.github.io/kuberay-helm"
+  - name: jupyterhub
+    version: "4.2.0"
+    repository: "https://hub.jupyter.org/helm-chart/"
+  - name: mlflow
+    version: "0.12.0"
+    repository: "https://community-charts.github.io/helm-charts"
+  - name: ollama
+    condition: ollama.enabled
+    version: "1.27.0"
+    repository: "https://helm.otwld.com"
diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/README.md b/ai/ai-starter-kit/helm-chart/ai-starter-kit/README.md
new file mode 100644
index 000000000..bc961f2e3
--- /dev/null
+++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/README.md
@@ -0,0 +1,271 @@
+# AI Starter Kit
+
+A comprehensive Helm chart for deploying a complete AI/ML development environment on Kubernetes. This starter kit provides a ready-to-use platform with JupyterHub notebooks, model serving capabilities, and experiment tracking - perfect for teams starting their AI journey or prototyping AI applications.
+
+## Purpose
+
+The AI Starter Kit simplifies the deployment of AI infrastructure by providing:
+
+- **JupyterHub**: Multi-user notebook environment with pre-configured AI/ML libraries
+- **Model Serving**: Support for both Ollama and Ramalama model servers
+- **MLflow**: Experiment tracking and model management
+- **Model Caching**: Persistent storage for efficient model management
+- **Example Notebooks**: Pre-loaded notebooks to get you started immediately
+
+## Prerequisites
+
+### General Requirements
+- Kubernetes cluster (minikube, GKE)
+- Helm 3.x installed
+- kubectl configured to access your cluster
+- Hugging Face token for accessing models
+
+### Platform-Specific Requirements
+
+#### Minikube (Local Development)
+- Docker Desktop or similar container runtime
+- Minimum 4 CPU cores and 16GB RAM available
+- 40GB+ free disk space
+
+## Installation
+
+### Quick Start (Minikube)
+
+1. **Create a folder for the persistent storage:**
+```bash
+mkdir -p /$HOME/models-cache
+```
+
+2. **Start minikube with persistent storage:**
+```bash
+minikube start --cpus 4 --memory 15000 \
+  --mount --mount-string="/$HOME/models-cache:/tmp/models-cache"
+```
+
+3. **Install the chart:**
+
+Inside the checked-out git repository, run:
+
+```bash
+cd ai/ai-starter-kit/helm-chart/ai-starter-kit
+helm dependency update
+helm install ai-starter-kit . \
+  --set huggingface.token="YOUR_HF_TOKEN" \
+  -f values.yaml
+```
+
+4. **Access JupyterHub:**
+```bash
+kubectl port-forward svc/ai-starter-kit-jupyterhub-proxy-public 8080:80
+```
+Navigate to http://localhost:8080 and log in with any username and the password `password`.
+
+### Makefile Commands
+
+The project includes a Makefile with convenient commands for managing the deployment:
+
+#### Development & Testing
+
+```bash
+make start       # Start minikube with mounted model cache (4 CPU, 15GB RAM)
+make start_gpu   # Start minikube with GPU support using krunkit driver
+make lint        # Run Helm chart linting to validate syntax
+make dep_update  # Update Helm chart dependencies
+```
+
+#### Installation & Management
+
+```bash
+make install HF_TOKEN=<your-token>  # Install/upgrade the chart (required: HF_TOKEN env var)
+make uninstall                      # Uninstall the ai-starter-kit release
+make destroy                        # Delete the entire minikube cluster
+```
+
+#### Validation
+
+```bash
+make validate_jupyterhub  # Verify JupyterHub deployment and run connectivity test
+make validate_ray         # Verify Ray cluster deployment and submit test job
+```
+
+#### Distribution
+
+```bash
+make package_helm                          # Package chart into tarball (output: out/)
+make push_helm OCI_HELM_TARGET=<registry>  # Push packaged chart to OCI registry
+```
+
+#### Required Environment Variables
+
+- `HF_TOKEN`: Hugging Face token (for `make install`)
+- `OCI_HELM_TARGET`: OCI registry path (for `make push_helm`)
+
+## Configuration
+
+### Key Configuration Options
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `huggingface.token` | Hugging Face token for models | `"YOUR_HF_TOKEN"` |
+| `ollama.enabled` | Enable Ollama model server | `true` |
+| `ramalama.enabled` | Enable Ramalama model server | `true` |
+| `modelsCachePvc.size` | Size of model cache storage | `10Gi` |
+| `jupyterhub.singleuser.defaultUrl` | Default notebook path | `/lab/tree/welcome.ipynb` |
+| `ray-cluster.enabled` | Enable Ray operator and server | `false` |
+
+### Storage Configuration
+
+The chart supports different storage configurations:
+
+- **Local Development**: Uses hostPath volumes with minikube mount
+- **Custom**: Configure via `modelsCachePvc.storageClassName`
+
+### Using GPUs
+
+To use GPUs for AI/ML workloads, add the necessary configuration to the individual services; check the documentation of the dependency charts for the exact values. The following snippet uses a GKE nodeSelector as an example. For instance, the jupyterhub config could be:
+
+```yaml
+jupyterhub:
+  ...
+  extraResource:
+    limits:
+      nvidia.com/gpu: 1
+    guarantees:
+      nvidia.com/gpu: 1
+
+  nodeSelector:
+    cloud.google.com/gke-accelerator: nvidia-l4
+```
+
+### Model Servers
+
+#### Ollama
+Ollama is enabled by default and provides:
+- Easy model management
+- REST API for inference
+- Support for popular models (Llama, Gemma, Qwen, etc.)
+- GPU acceleration support
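+
+For example, once the chart is installed you can call the Ollama REST API from a notebook inside the cluster with the Python client. This is a minimal sketch, assuming the `ollama` Python package is installed and the `gemma3` model has already been pulled into the Ollama server (the bundled `chat_bot.ipynb` notebook makes the same assumptions):
+
+```python
+from ollama import Client
+
+# In-cluster service name exposed by this chart (11434 is Ollama's default port).
+client = Client(host='http://ai-starter-kit-ollama:11434')
+
+# Send a single-turn chat request to the gemma3 model and print the reply.
+response = client.chat(
+    model='gemma3',
+    messages=[{'role': 'user', 'content': 'Say hello in one short sentence.'}],
+)
+print(response.message.content)
+```
+
+From outside the cluster, port-forward the Ollama service first (see "Accessing Services" below) and point the client at `http://localhost:11434` instead.
+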
+
+#### Ramalama
+Ramalama provides:
+- Alternative model serving solution
+- Support for CUDA and Metal (macOS) acceleration
+- Lightweight deployment option
+
+## Usage
+
+### Accessing Services
+
+#### JupyterHub
+```bash
+# Port forward to access JupyterHub
+kubectl port-forward svc/ai-starter-kit-jupyterhub-proxy-public 8080:80
+# Access at: http://localhost:8080
+# Default password: password
+```
+
+#### MLflow
+```bash
+# Port forward to access MLflow UI
+kubectl port-forward svc/ai-starter-kit-mlflow 5000:5000
+# Access at: http://localhost:5000
+```
+
+#### Ollama/Ramalama API
+```bash
+# For Ollama
+kubectl port-forward svc/ai-starter-kit-ollama 11434:11434
+
+# For Ramalama
+kubectl port-forward svc/ai-starter-kit-ramalama 8080:8080
+```
+
+### Pre-loaded Example Notebooks
+
+The JupyterHub environment comes with pre-loaded example notebooks:
+- `ray.ipynb`: Simple Ray and MLflow example.
+- `chat_bot.ipynb`: Simple chatbot interface using Ollama for conversational AI.
+- `multi-agent.ipynb`: Multi-agent workflow demonstration using Ray.
+- `multi-agent-ollama.ipynb`: Similar multi-agent workflow demonstration using Ollama.
+- `multi-agent-ramalama.ipynb`: Similar multi-agent workflow using the RamaLama runtime for comparison.
+- `welcome.ipynb`: Introduction notebook with embedding model examples using Qwen models.
+
+These notebooks are automatically copied to your workspace on first login.
+
+## Architecture
+
+The AI Starter Kit consists of:
+
+1. **JupyterHub**: Multi-user notebook server with persistent storage
+2. **Model Serving**: Choice of Ollama or Ramalama for LLM inference
+3. **MLflow**: Experiment tracking and model registry
+4. **Persistent Storage**: Shared model cache to avoid redundant downloads
+5. 
**Init Containers**: Automated setup of models and notebooks + +## Cleanup + +### Uninstall the chart +```bash +helm uninstall ai-starter-kit +``` + +### Delete persistent volumes (optional) +```bash +kubectl delete pvc ai-starter-kit-models-cache-pvc +kubectl delete pvc ai-starter-kit-jupyterhub-hub-db-dir +``` + +### Stop minikube +```bash +minikube stop +minikube delete # To completely remove the cluster +``` + +## Troubleshooting + +### Common Issues + +#### Pods stuck in Pending state +- Check available resources: `kubectl describe pod ` +- Increase cluster resources or reduce resource requests + +#### Model download failures +- Verify Hugging Face token is set correctly +- Check internet connectivity from pods +- Increase init container timeout in values + +#### GPU not detected +- Verify GPU nodes are available: `kubectl get nodes -o wide` +- Check GPU driver installation +- Ensure correct node selectors and tolerations + +#### Storage issues +- Verify PVC is bound: `kubectl get pvc` +- Check storage class availability: `kubectl get storageclass` +- Ensure sufficient disk space + +### Debug Commands +```bash +# Check pod status +kubectl get pods -n default + +# View pod logs +kubectl logs -f + +# Describe pod for events +kubectl describe pod + +# Check resource usage +kubectl top nodes +kubectl top pods +``` + +## Resources + +- [JupyterHub Documentation](https://jupyterhub.readthedocs.io/) +- [MLflow Documentation](https://mlflow.org/docs/latest/index.html) +- [Ollama Documentation](https://ollama.ai/docs) +- [Kubernetes Documentation](https://kubernetes.io/docs/) +- [Helm Documentation](https://helm.sh/docs/) \ No newline at end of file diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/chat_bot.ipynb b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/chat_bot.ipynb new file mode 100644 index 000000000..0834cf6c3 --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/chat_bot.ipynb @@ -0,0 +1,312 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e9e3dd59-b4d9-4de5-a6aa-a72d1480ac77", + "metadata": {}, + "outputs": [], + "source": [ + "from ollama import Client\n", + "\n", + "client = Client(\n", + " host='http://ai-starter-kit-ollama:11434',\n", + " headers={'x-some-header': 'some-value'}\n", + ")\n", + "\n", + "def get_response(prompt):\n", + " response = client.chat(model='gemma3', messages=[\n", + " {\n", + " 'role': 'user',\n", + " 'content': prompt,\n", + " },\n", + " ])\n", + " return response.message.content" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dd1513d4-18c5-46d7-8260-f90be004d315", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": "(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n const py_version = '3.7.3'.replace('rc', '-rc.').replace('.dev', '-dev.');\n const reloading = false;\n const Bokeh = root.Bokeh;\n\n // Set a timeout for this load but only if we are not already initializing\n if (typeof (root._bokeh_timeout) === \"undefined\" || (force || !root._bokeh_is_initializing)) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks;\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n 
}\n\n function load_libs(css_urls, js_urls, js_modules, js_exports, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n if (js_modules == null) js_modules = [];\n if (js_exports == null) js_exports = {};\n\n root._bokeh_onload_callbacks.push(callback);\n\n if (root._bokeh_is_loading > 0) {\n // Don't load bokeh if it is still initializing\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n } else if (js_urls.length === 0 && js_modules.length === 0 && Object.keys(js_exports).length === 0) {\n // There is nothing to load\n run_callbacks();\n return null;\n }\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n window._bokeh_on_load = on_load\n\n function on_error(e) {\n const src_el = e.srcElement\n console.error(\"failed to load \" + (src_el.href || src_el.src));\n }\n\n const skip = [];\n if (window.requirejs) {\n window.requirejs.config({'packages': {}, 'paths': {}, 'shim': {}});\n root._bokeh_is_loading = css_urls.length + 0;\n } else {\n root._bokeh_is_loading = css_urls.length + js_urls.length + js_modules.length + Object.keys(js_exports).length;\n }\n\n const existing_stylesheets = []\n const links = document.getElementsByTagName('link')\n for (let i = 0; i < links.length; i++) {\n const link = links[i]\n if (link.href != null) {\n existing_stylesheets.push(link.href)\n }\n }\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const escaped = encodeURI(url)\n if (existing_stylesheets.indexOf(escaped) !== -1) {\n on_load()\n continue;\n }\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error;\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n } var existing_scripts = []\n const scripts = document.getElementsByTagName('script')\n for (let i = 0; i < scripts.length; i++) {\n var script = scripts[i]\n if (script.src != null) {\n existing_scripts.push(script.src)\n }\n }\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const escaped = encodeURI(url)\n if (skip.indexOf(escaped) !== -1 || existing_scripts.indexOf(escaped) !== -1) {\n if (!window.requirejs) {\n on_load();\n }\n continue;\n }\n const element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n for (let i = 0; i < js_modules.length; i++) {\n const url = js_modules[i];\n const escaped = encodeURI(url)\n if (skip.indexOf(escaped) !== -1 || existing_scripts.indexOf(escaped) !== -1) {\n if (!window.requirejs) {\n on_load();\n }\n continue;\n }\n var element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n element.type = \"module\";\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n for (const name in js_exports) {\n const url = js_exports[name];\n const escaped = encodeURI(url)\n if (skip.indexOf(escaped) >= 0 || root[name] != null) {\n if (!window.requirejs) {\n on_load();\n }\n continue;\n }\n var 
element = document.createElement('script');\n element.onerror = on_error;\n element.async = false;\n element.type = \"module\";\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n element.textContent = `\n import ${name} from \"${url}\"\n window.${name} = ${name}\n window._bokeh_on_load()\n `\n document.head.appendChild(element);\n }\n if (!js_urls.length && !js_modules.length) {\n on_load()\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.holoviz.org/panel/1.7.5/dist/bundled/reactiveesm/es-module-shims@^1.10.0/dist/es-module-shims.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-3.7.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.7.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.7.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.7.3.min.js\", \"https://cdn.holoviz.org/panel/1.7.5/dist/panel.min.js\"];\n const js_modules = [];\n const js_exports = {};\n const css_urls = [];\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {} // ensure no trailing comma for IE\n ];\n\n function run_inline_js() {\n if ((root.Bokeh !== undefined) || (force === true)) {\n for (let i = 0; i < inline_js.length; i++) {\n try {\n inline_js[i].call(root, root.Bokeh);\n } catch(e) {\n if (!reloading) {\n throw e;\n }\n }\n }\n // Cache old bokeh versions\n if (Bokeh != undefined && !reloading) {\n var NewBokeh = root.Bokeh;\n if (Bokeh.versions === undefined) {\n Bokeh.versions = new Map();\n }\n if (NewBokeh.version !== Bokeh.version) {\n Bokeh.versions.set(NewBokeh.version, NewBokeh)\n }\n root.Bokeh = Bokeh;\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n }\n root._bokeh_is_initializing = false\n }\n\n function load_or_wait() {\n // Implement a backoff loop that tries to ensure we do not load multiple\n // versions of Bokeh and its dependencies at the same time.\n // In recent versions we use the root._bokeh_is_initializing flag\n // to determine whether there is an ongoing attempt to initialize\n // bokeh, however for backward compatibility we also try to ensure\n // that we do not start loading a newer (Panel>=1.0 and Bokeh>3) version\n // before older versions are fully initialized.\n if (root._bokeh_is_initializing && Date.now() > root._bokeh_timeout) {\n // If the timeout and bokeh was not successfully loaded we reset\n // everything and try loading again\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_is_initializing = false;\n root._bokeh_onload_callbacks = undefined;\n root._bokeh_is_loading = 0\n console.log(\"Bokeh: BokehJS was loaded multiple times but one version failed to initialize.\");\n load_or_wait();\n } else if (root._bokeh_is_initializing || (typeof root._bokeh_is_initializing === \"undefined\" && root._bokeh_onload_callbacks !== undefined)) {\n setTimeout(load_or_wait, 100);\n } else {\n root._bokeh_is_initializing = true\n root._bokeh_onload_callbacks = []\n const bokeh_loaded = root.Bokeh != null && (root.Bokeh.version === py_version || (root.Bokeh.versions !== undefined && root.Bokeh.versions.has(py_version)));\n if (!reloading && !bokeh_loaded) {\n if (root.Bokeh) {\n root.Bokeh = undefined;\n 
}\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n }\n load_libs(css_urls, js_urls, js_modules, js_exports, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n }\n // Give older versions of the autoload script a head-start to ensure\n // they initialize before we start loading newer version.\n setTimeout(load_or_wait, 100)\n}(window));", + "application/vnd.holoviews_load.v0+json": "" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": "\nif ((window.PyViz === undefined) || (window.PyViz instanceof HTMLElement)) {\n window.PyViz = {comms: {}, comm_status:{}, kernels:{}, receivers: {}, plot_index: []}\n}\n\n\n function JupyterCommManager() {\n }\n\n JupyterCommManager.prototype.register_target = function(plot_id, comm_id, msg_handler) {\n if (window.comm_manager || ((window.Jupyter !== undefined) && (Jupyter.notebook.kernel != null))) {\n var comm_manager = window.comm_manager || Jupyter.notebook.kernel.comm_manager;\n comm_manager.register_target(comm_id, function(comm) {\n comm.on_msg(msg_handler);\n });\n } else if ((plot_id in window.PyViz.kernels) && (window.PyViz.kernels[plot_id])) {\n window.PyViz.kernels[plot_id].registerCommTarget(comm_id, function(comm) {\n comm.onMsg = msg_handler;\n });\n } else if (typeof google != 'undefined' && google.colab.kernel != null) {\n google.colab.kernel.comms.registerTarget(comm_id, (comm) => {\n var messages = comm.messages[Symbol.asyncIterator]();\n function processIteratorResult(result) {\n var message = result.value;\n var content = {data: message.data, comm_id};\n var buffers = []\n for (var buffer of message.buffers || []) {\n buffers.push(new DataView(buffer))\n }\n var metadata = message.metadata || {};\n var msg = {content, buffers, metadata}\n msg_handler(msg);\n return messages.next().then(processIteratorResult);\n }\n return messages.next().then(processIteratorResult);\n })\n }\n }\n\n JupyterCommManager.prototype.get_client_comm = function(plot_id, comm_id, msg_handler) {\n if (comm_id in window.PyViz.comms) {\n return window.PyViz.comms[comm_id];\n } else if (window.comm_manager || ((window.Jupyter !== undefined) && (Jupyter.notebook.kernel != null))) {\n var comm_manager = window.comm_manager || Jupyter.notebook.kernel.comm_manager;\n var comm = comm_manager.new_comm(comm_id, {}, {}, {}, comm_id);\n if (msg_handler) {\n comm.on_msg(msg_handler);\n }\n } else if ((plot_id in window.PyViz.kernels) && (window.PyViz.kernels[plot_id])) {\n var comm = window.PyViz.kernels[plot_id].connectToComm(comm_id);\n let retries = 0;\n const open = () => {\n if (comm.active) {\n comm.open();\n } else if (retries > 3) {\n console.warn('Comm target never activated')\n } else {\n retries += 1\n setTimeout(open, 500)\n }\n }\n if (comm.active) {\n comm.open();\n } else {\n setTimeout(open, 500)\n }\n if (msg_handler) {\n comm.onMsg = msg_handler;\n }\n } else if (typeof google != 'undefined' && google.colab.kernel != null) {\n var comm_promise = google.colab.kernel.comms.open(comm_id)\n comm_promise.then((comm) => {\n window.PyViz.comms[comm_id] = comm;\n if (msg_handler) {\n var messages = comm.messages[Symbol.asyncIterator]();\n function processIteratorResult(result) {\n var message = result.value;\n var content = {data: message.data};\n var metadata = message.metadata || {comm_id};\n var msg = {content, metadata}\n msg_handler(msg);\n return messages.next().then(processIteratorResult);\n }\n return 
messages.next().then(processIteratorResult);\n }\n })\n var sendClosure = (data, metadata, buffers, disposeOnDone) => {\n return comm_promise.then((comm) => {\n comm.send(data, metadata, buffers, disposeOnDone);\n });\n };\n var comm = {\n send: sendClosure\n };\n }\n window.PyViz.comms[comm_id] = comm;\n return comm;\n }\n window.PyViz.comm_manager = new JupyterCommManager();\n \n\n\nvar JS_MIME_TYPE = 'application/javascript';\nvar HTML_MIME_TYPE = 'text/html';\nvar EXEC_MIME_TYPE = 'application/vnd.holoviews_exec.v0+json';\nvar CLASS_NAME = 'output';\n\n/**\n * Render data to the DOM node\n */\nfunction render(props, node) {\n var div = document.createElement(\"div\");\n var script = document.createElement(\"script\");\n node.appendChild(div);\n node.appendChild(script);\n}\n\n/**\n * Handle when a new output is added\n */\nfunction handle_add_output(event, handle) {\n var output_area = handle.output_area;\n var output = handle.output;\n if ((output.data == undefined) || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n return\n }\n var id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n if (id !== undefined) {\n var nchildren = toinsert.length;\n var html_node = toinsert[nchildren-1].children[0];\n html_node.innerHTML = output.data[HTML_MIME_TYPE];\n var scripts = [];\n var nodelist = html_node.querySelectorAll(\"script\");\n for (var i in nodelist) {\n if (nodelist.hasOwnProperty(i)) {\n scripts.push(nodelist[i])\n }\n }\n\n scripts.forEach( function (oldScript) {\n var newScript = document.createElement(\"script\");\n var attrs = [];\n var nodemap = oldScript.attributes;\n for (var j in nodemap) {\n if (nodemap.hasOwnProperty(j)) {\n attrs.push(nodemap[j])\n }\n }\n attrs.forEach(function(attr) { newScript.setAttribute(attr.name, attr.value) });\n newScript.appendChild(document.createTextNode(oldScript.innerHTML));\n oldScript.parentNode.replaceChild(newScript, oldScript);\n });\n if (JS_MIME_TYPE in output.data) {\n toinsert[nchildren-1].children[1].textContent = output.data[JS_MIME_TYPE];\n }\n output_area._hv_plot_id = id;\n if ((window.Bokeh !== undefined) && (id in Bokeh.index)) {\n window.PyViz.plot_index[id] = Bokeh.index[id];\n } else {\n window.PyViz.plot_index[id] = null;\n }\n } else if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n var bk_div = document.createElement(\"div\");\n bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n var script_attrs = bk_div.children[0].attributes;\n for (var i = 0; i < script_attrs.length; i++) {\n toinsert[toinsert.length - 1].childNodes[1].setAttribute(script_attrs[i].name, script_attrs[i].value);\n }\n // store reference to server id on output_area\n output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n }\n}\n\n/**\n * Handle when an output is cleared or removed\n */\nfunction handle_clear_output(event, handle) {\n var id = handle.cell.output_area._hv_plot_id;\n var server_id = handle.cell.output_area._bokeh_server_id;\n if (((id === undefined) || !(id in PyViz.plot_index)) && (server_id !== undefined)) { return; }\n var comm = window.PyViz.comm_manager.get_client_comm(\"hv-extension-comm\", \"hv-extension-comm\", function () {});\n if (server_id !== null) {\n comm.send({event_type: 'server_delete', 'id': server_id});\n return;\n } else if (comm !== null) {\n comm.send({event_type: 'delete', 'id': id});\n }\n delete PyViz.plot_index[id];\n if ((window.Bokeh !== undefined) & (id in window.Bokeh.index)) {\n var doc = 
window.Bokeh.index[id].model.document\n doc.clear();\n const i = window.Bokeh.documents.indexOf(doc);\n if (i > -1) {\n window.Bokeh.documents.splice(i, 1);\n }\n }\n}\n\n/**\n * Handle kernel restart event\n */\nfunction handle_kernel_cleanup(event, handle) {\n delete PyViz.comms[\"hv-extension-comm\"];\n window.PyViz.plot_index = {}\n}\n\n/**\n * Handle update_display_data messages\n */\nfunction handle_update_output(event, handle) {\n handle_clear_output(event, {cell: {output_area: handle.output_area}})\n handle_add_output(event, handle)\n}\n\nfunction register_renderer(events, OutputArea) {\n function append_mime(data, metadata, element) {\n // create a DOM node to render to\n var toinsert = this.create_output_subarea(\n metadata,\n CLASS_NAME,\n EXEC_MIME_TYPE\n );\n this.keyboard_manager.register_events(toinsert);\n // Render to node\n var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n render(props, toinsert[0]);\n element.append(toinsert);\n return toinsert\n }\n\n events.on('output_added.OutputArea', handle_add_output);\n events.on('output_updated.OutputArea', handle_update_output);\n events.on('clear_output.CodeCell', handle_clear_output);\n events.on('delete.Cell', handle_clear_output);\n events.on('kernel_ready.Kernel', handle_kernel_cleanup);\n\n OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n safe: true,\n index: 0\n });\n}\n\nif (window.Jupyter !== undefined) {\n try {\n var events = require('base/js/events');\n var OutputArea = require('notebook/js/outputarea').OutputArea;\n if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n register_renderer(events, OutputArea);\n }\n } catch(err) {\n }\n}\n", + "application/vnd.holoviews_load.v0+json": "" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.holoviews_exec.v0+json": "", + "text/html": [ + "
\n", + "
\n", + "
\n", + "" + ] + }, + "metadata": { + "application/vnd.holoviews_exec.v0+json": { + "id": "b6fd14e0-f8d2-46e7-9c4d-722893d04d7e" + } + }, + "output_type": "display_data" + }, + { + "data": {}, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.holoviews_exec.v0+json": "", + "text/html": [ + "
\n", + "
\n", + "
\n", + "" + ], + "text/plain": [ + "Column\n", + " [0] TextInput(placeholder='Enter text here…')\n", + " [1] Row\n", + " [0] Button(name='Chat!')\n", + " [2] ParamFunction(function, _pane=Column, defer_load=False, height=300, loading_indicator=True, sizing_mode='fixed', width=300)" + ] + }, + "execution_count": 2, + "metadata": { + "application/vnd.holoviews_exec.v0+json": { + "id": "2854d6b0-689d-4dc0-8861-1834489708e9" + } + }, + "output_type": "execute_result" + } + ], + "source": [ + "import panel as pn # GUI\n", + "pn.extension()\n", + "\n", + "panels = [] # collect display \n", + "context = [ ] # accumulate messages\n", + "\n", + "\n", + "def collect_messages(_):\n", + " prompt = inp.value_input\n", + " inp.value = ''\n", + " if (not prompt):\n", + " return pn.Column(*panels)\n", + "\n", + " response = get_response(prompt)\n", + " context.append({'role':'user', 'content':f\"{prompt}\"})\n", + " context.append({'role':'assistant', 'content':f\"{response}\"})\n", + " panels.append(\n", + " pn.Row('User:', pn.pane.Markdown(prompt, width=600)))\n", + " panels.append(\n", + " pn.Row('Assistant:', pn.pane.Markdown(response, width=600)))\n", + " \n", + " return pn.Column(*panels)\n", + "\n", + "\n", + "inp = pn.widgets.TextInput(value=\"Hi\", placeholder='Enter text here…')\n", + "button_conversation = pn.widgets.Button(name=\"Chat!\")\n", + "interactive_conversation = pn.bind(collect_messages, button_conversation)\n", + "dashboard = pn.Column(\n", + " inp,\n", + " pn.Row(button_conversation),\n", + " pn.panel(interactive_conversation, loading_indicator=True, height=300, width=300),\n", + ")\n", + "\n", + "dashboard" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/download_models.py b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/download_models.py new file mode 100644 index 000000000..69529726b --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/download_models.py @@ -0,0 +1,22 @@ +import sys +from huggingface_hub import snapshot_download + +# --- Model Download --- +if __name__ == "__main__": + # List your desired Hugging Face model names here + model_names = [ + "Qwen/Qwen3-Embedding-0.6B", + ] + + for model_name in model_names: + print(f"--- Downloading {model_name} ---") + try: + if len(sys.argv) > 1: + snapshot_download(repo_id=model_name, cache_dir=sys.argv[0]) + else: + snapshot_download(repo_id=model_name) + print(f"Successfully cached {model_name}") + except Exception as e: + print(f"Failed to download {model_name}. Error: {e}") + + print("--- Model download process finished. 
---") diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/multi-agent-ollama.ipynb b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/multi-agent-ollama.ipynb new file mode 100644 index 000000000..e51183a02 --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/multi-agent-ollama.ipynb @@ -0,0 +1,597 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "079fadd2-200e-4d37-8ae2-be2792e3a24e", + "metadata": {}, + "source": [ + "### Cell 1 - Install Ollama and verify environment\n", + "\n", + "Installs Ollama for local model serving, sets up environment variables, and verifies the installation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79db57cd-fb72-4b10-b0fb-5e9cd5c007b6", + "metadata": {}, + "outputs": [], + "source": [ + "!pip -q install ollama requests --disable-pip-version-check\n", + "\n", + "import os, subprocess, time, json, requests\n", + "from pathlib import Path\n", + "\n", + "os.environ['OLLAMA_HOST'] = os.getenv('OLLAMA_HOST', 'http://ai-starter-kit-ollama:11434')\n", + "MODEL_NAME = \"qwen2.5:1.5b\"\n", + "MLFLOW_URI = os.getenv(\"MLFLOW_TRACKING_URI\", \"http://ai-starter-kit-mlflow:5000\")\n", + "\n", + "OLLAMA_HOST = os.environ['OLLAMA_HOST']\n", + "\n", + "print(\"Environment Configuration:\")\n", + "print(\"Ollama Host:\", OLLAMA_HOST)\n", + "print(\"Model: \", MODEL_NAME)\n", + "print(\"MLflow: \", MLFLOW_URI)\n", + "print(\"-\" * 60)\n", + "\n", + "try:\n", + " r = requests.get(f\"{OLLAMA_HOST}/api/version\", timeout=5)\n", + " print(\"Ollama version:\", r.json())\n", + "except Exception as e:\n", + " print(\"Note: Ollama service not running. Starting it in next cell...\")" + ] + }, + { + "cell_type": "markdown", + "id": "fe862173-fd9a-41ae-a27b-63875f788024", + "metadata": {}, + "source": [ + "### Cell 2 - Start Ollama service and pull model\n", + "\n", + "Starts the Ollama service if not running, pulls the Qwen 2.5 1.5B model, and verifies it's ready." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34da3e26-6276-48b7-b3ac-c90359df6547", + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess, time, requests, os\n", + "\n", + "OLLAMA_HOST = os.environ.get('OLLAMA_HOST', 'http://ai-starter-kit-ollama:11434')\n", + "MODEL_NAME = \"qwen2.5:1.5b\"\n", + "\n", + "def check_ollama():\n", + " try:\n", + " r = requests.get(f\"{OLLAMA_HOST}/api/tags\", timeout=2)\n", + " return r.status_code == 200\n", + " except:\n", + " return False\n", + "\n", + "if not check_ollama() and OLLAMA_HOST.startswith(\"http://ai-starter-kit-ollama\"):\n", + " print(\"Starting Ollama service...\")\n", + " try:\n", + " subprocess.Popen([\"ollama\", \"serve\"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)\n", + " time.sleep(3)\n", + " except Exception as e:\n", + " print(f\"Could not start Ollama automatically: {e}\")\n", + " print(\"Please start Ollama manually with: ollama serve\")\n", + "\n", + "if check_ollama():\n", + " print(\"Ollama service is running\")\n", + " \n", + " print(f\"\\nPulling model {MODEL_NAME}...\")\n", + " try:\n", + " r = requests.get(f\"{OLLAMA_HOST}/api/tags\")\n", + " models = r.json().get('models', [])\n", + " model_exists = any(m.get('name') == MODEL_NAME for m in models)\n", + " \n", + " if not model_exists:\n", + " pull_data = {\"name\": MODEL_NAME}\n", + " r = requests.post(f\"{OLLAMA_HOST}/api/pull\", json=pull_data, stream=True)\n", + " for line in r.iter_lines():\n", + " if line:\n", + " try:\n", + " status = json.loads(line)\n", + " if 'status' in status:\n", + " print(f\" {status['status']}\", end='\\r')\n", + " except:\n", + " pass\n", + " print(f\"\\nModel {MODEL_NAME} pulled successfully\")\n", + " else:\n", + " print(f\"Model {MODEL_NAME} already available\")\n", + " except Exception as e:\n", + " print(f\"Error pulling model: {e}\")\n", + "else:\n", + " print(\"Warning: Ollama service is not running\")\n", + " print(\"Please ensure Ollama is installed and running\")" + ] + }, + { + "cell_type": "markdown", + "id": "8111d705-595e-4e65-8479-bdc76191fa31", + "metadata": {}, + "source": [ + "### Cell 3 - Create OpenAI-compatible API wrapper\n", + "\n", + "Sets up a simple FastAPI server that wraps Ollama with an OpenAI-compatible API, including MLflow tracking." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbea1539-e9ab-460a-9cfc-20a42807f616", + "metadata": {}, + "outputs": [], + "source": [ + "!pip -q install fastapi uvicorn mlflow --disable-pip-version-check\n", + "\n", + "import os, subprocess, time, json, requests, threading\n", + "from pathlib import Path\n", + "\n", + "import psutil\n", + "\n", + "api_wrapper_code = '''\n", + "import os, time, uuid, requests, json\n", + "from fastapi import FastAPI, Request\n", + "from fastapi.responses import JSONResponse\n", + "import uvicorn\n", + "\n", + "USE_MLFLOW = False\n", + "try:\n", + " import mlflow\n", + " mlflow_uri = os.getenv(\"MLFLOW_TRACKING_URI\")\n", + " if mlflow_uri:\n", + " mlflow.set_tracking_uri(mlflow_uri)\n", + " mlflow.set_experiment(\"ollama-llm\")\n", + " USE_MLFLOW = True\n", + "except:\n", + " pass\n", + "\n", + "app = FastAPI()\n", + "OLLAMA_HOST = os.getenv(\"OLLAMA_HOST\", \"http://ai-starter-kit-ollama:11434\")\n", + "MODEL_NAME = os.getenv(\"MODEL_NAME\", \"qwen2.5:1.5b\")\n", + "\n", + "@app.get(\"/v1/healthz\")\n", + "async def health():\n", + " return {\"status\": \"ok\", \"model\": MODEL_NAME}\n", + "\n", + "@app.post(\"/v1/chat/completions\")\n", + "async def chat_completions(request: Request):\n", + " t0 = time.time()\n", + " body = await request.json()\n", + " \n", + " messages = body.get(\"messages\", [])\n", + " temperature = body.get(\"temperature\", 0.7)\n", + " max_tokens = body.get(\"max_tokens\", 256)\n", + " \n", + " # Call Ollama API\n", + " ollama_payload = {\n", + " \"model\": MODEL_NAME,\n", + " \"messages\": messages,\n", + " \"stream\": False,\n", + " \"options\": {\n", + " \"temperature\": temperature,\n", + " \"num_predict\": max_tokens\n", + " }\n", + " }\n", + " \n", + " try:\n", + " r = requests.post(f\"{OLLAMA_HOST}/api/chat\", json=ollama_payload, timeout=120)\n", + " r.raise_for_status()\n", + " ollama_response = r.json()\n", + " \n", + " content = ollama_response.get(\"message\", {}).get(\"content\", \"\")\n", + " prompt_tokens = len(\" \".join(m.get(\"content\", \"\") for m in messages).split())\n", + " completion_tokens = len(content.split())\n", + " \n", + " if USE_MLFLOW:\n", + " try:\n", + " with mlflow.start_run():\n", + " mlflow.log_params({\n", + " \"temperature\": temperature,\n", + " \"max_tokens\": max_tokens,\n", + " \"model\": MODEL_NAME\n", + " })\n", + " mlflow.log_metrics({\n", + " \"duration_ms\": int((time.time() - t0) * 1000),\n", + " \"prompt_tokens_approx\": prompt_tokens,\n", + " \"completion_tokens_approx\": completion_tokens,\n", + " \"total_tokens_approx\": prompt_tokens + completion_tokens\n", + " })\n", + " except:\n", + " pass\n", + " \n", + " return {\n", + " \"id\": \"chatcmpl-\" + uuid.uuid4().hex[:8],\n", + " \"object\": \"chat.completion\",\n", + " \"created\": int(time.time()),\n", + " \"model\": MODEL_NAME,\n", + " \"choices\": [{\n", + " \"index\": 0,\n", + " \"message\": {\"role\": \"assistant\", \"content\": content},\n", + " \"finish_reason\": \"stop\"\n", + " }],\n", + " \"usage\": {\n", + " \"prompt_tokens\": prompt_tokens,\n", + " \"completion_tokens\": completion_tokens,\n", + " \"total_tokens\": prompt_tokens + completion_tokens\n", + " }\n", + " }\n", + " except Exception as e:\n", + " return JSONResponse(status_code=500, content={\"error\": str(e)})\n", + "\n", + "if __name__ == \"__main__\":\n", + " uvicorn.run(app, host=\"0.0.0.0\", port=8000)\n", + "'''\n", + "\n", + "with open('/tmp/ollama_wrapper.py', 'w') as f:\n", + " f.write(api_wrapper_code)\n", + "\n", + 
"print(\"Checking if wrapper script was written...\")\n", + "if not os.path.exists('/tmp/ollama_wrapper.py'):\n", + " print(\"ERROR: Failed to write wrapper script!\")\n", + " raise Exception(\"Cannot proceed without wrapper script\")\n", + "print(\" Wrapper script created\")\n", + "\n", + "print(\"\\nKilling existing wrapper processes...\")\n", + "proc_iter = psutil.process_iter(attrs=[\"pid\", \"name\", \"cmdline\"])\n", + "for p in proc_iter:\n", + " for arg in p.info[\"cmdline\"]:\n", + " if \"ollama_wrapper.py\" in arg:\n", + " p.kill()\n", + " print(f\"...Process {p.pid} has been killed\")\n", + " break\n", + "\n", + "time.sleep(2)\n", + "\n", + "log_file = '/tmp/wrapper.log'\n", + "print(f\"\\nPreparing log file: {log_file}\")\n", + "!touch /tmp/wrapper.log\n", + "!chmod 666 /tmp/wrapper.log\n", + "\n", + "if not os.path.exists(log_file):\n", + " print(f\"ERROR: Could not create log file at {log_file}\")\n", + " raise Exception(\"Cannot create log file\")\n", + "print(\" Log file ready\")\n", + "\n", + "env_vars = {\n", + " 'OLLAMA_HOST': os.getenv('OLLAMA_HOST', 'http://ai-starter-kit-ollama:11434'),\n", + " 'MODEL_NAME': 'qwen2.5:1.5b',\n", + " 'MLFLOW_TRACKING_URI': os.getenv('MLFLOW_TRACKING_URI', 'http://ai-starter-kit-mlflow:5000')\n", + "}\n", + "\n", + "print(\"\\nEnvironment variables:\")\n", + "for k, v in env_vars.items():\n", + " print(f\" {k}={v}\")\n", + "\n", + "print(\"\\nStarting API wrapper...\")\n", + "with open(log_file, 'w') as log:\n", + " process = subprocess.Popen(\n", + " ['python', '/tmp/ollama_wrapper.py'],\n", + " stdout=log,\n", + " stderr=subprocess.STDOUT,\n", + " env={**os.environ, **env_vars},\n", + " start_new_session=True\n", + " )\n", + " \n", + "print(f\"Process started with PID: {process.pid}\")\n", + "\n", + "time.sleep(2)\n", + "if process.poll() is not None:\n", + " print(f\"\\nERROR: Process died immediately with exit code {process.poll()}\")\n", + " print(\"\\nLog contents:\")\n", + " !cat /tmp/wrapper.log\n", + " raise Exception(\"API wrapper failed to start\")\n", + "print(\" Process is running\")\n", + "\n", + "print(\"\\nWaiting for API to respond...\")\n", + "api_ready = False\n", + "for i in range(30):\n", + " time.sleep(1)\n", + " try:\n", + " r = requests.get(\"http://localhost:8000/v1/healthz\", timeout=1)\n", + " if r.status_code == 200:\n", + " print(f\"\\n API is ready! Response: {r.json()}\")\n", + " print(f\"\\nOpenAI-compatible API running at: http://localhost:8000/v1\")\n", + " print(f\"Health: http://localhost:8000/v1/healthz\")\n", + " print(f\"Chat: http://localhost:8000/v1/chat/completions\")\n", + " api_ready = True\n", + " break\n", + " except requests.exceptions.ConnectionError:\n", + " if i % 5 == 0:\n", + " print(f\" Waiting for API... 
({i}s)\")\n", + " except Exception as e:\n", + " print(f\" Unexpected error: {e}\")\n", + "\n", + "if not api_ready:\n", + " print(\"\\nAPI wrapper failed to start within 30 seconds\")\n", + " print(\"\\nChecking if process is still alive...\")\n", + " if process.poll() is not None:\n", + " print(f\"Process died with exit code: {process.poll()}\")\n", + " else:\n", + " print(\"Process is still running but not responding\")\n", + " \n", + " print(\"\\nLast 50 lines of logs:\")\n", + " !tail -50 /tmp/wrapper.log\n", + " \n", + " print(\"\\nChecking if port 8000 is in use:\")\n", + " !netstat -tlnp 2>/dev/null | grep 8000 || echo \"No process on port 8000\"\n", + " \n", + " print(\"\\nChecking Python processes:\")\n", + " !ps aux | grep python | grep -v grep" + ] + }, + { + "cell_type": "markdown", + "id": "a411c015-c802-4ca1-81bb-3f4790d9626a", + "metadata": {}, + "source": [ + "### Cell 4 - Basic client + latency test\n", + "\n", + "Tests the OpenAI-compatible API with a simple chat request and measures latency." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3be634e2-a82f-42c9-8e31-57e6868a86ee", + "metadata": {}, + "outputs": [], + "source": [ + "import os, time, requests, json\n", + "\n", + "BASE_URL = \"http://localhost:8000/v1\"\n", + "OLLAMA_DIRECT = os.getenv(\"OLLAMA_HOST\", \"http://ai-starter-kit-ollama:11434\")\n", + "\n", + "try:\n", + " r = requests.get(f\"{BASE_URL}/healthz\", timeout=2)\n", + " USE_WRAPPER = r.status_code == 200\n", + " print(\"✓ Using: OpenAI-compatible wrapper\")\n", + "except:\n", + " USE_WRAPPER = False\n", + " print(\"✓ Using: Direct Ollama API (wrapper not available)\")\n", + "\n", + "def health():\n", + " if USE_WRAPPER:\n", + " r = requests.get(f\"{BASE_URL}/healthz\", timeout=10)\n", + " print(\"Health:\", r.status_code, r.json())\n", + " else:\n", + " r = requests.get(f\"{OLLAMA_DIRECT}/api/tags\", timeout=10)\n", + " print(\"Health:\", r.status_code, \"Models available:\", len(r.json().get('models', [])))\n", + "\n", + "def chat(prompt, temperature=0.4, max_tokens=220):\n", + " if USE_WRAPPER:\n", + " body = {\n", + " \"model\": \"qwen2.5:1.5b\",\n", + " \"temperature\": temperature,\n", + " \"max_tokens\": max_tokens,\n", + " \"messages\": [\n", + " {\"role\": \"system\", \"content\": \"You are a helpful assistant. Be concise.\"},\n", + " {\"role\": \"user\", \"content\": prompt},\n", + " ]\n", + " }\n", + " endpoint = f\"{BASE_URL}/chat/completions\"\n", + " else:\n", + " body = {\n", + " \"model\": \"qwen2.5:1.5b\",\n", + " \"messages\": [\n", + " {\"role\": \"system\", \"content\": \"You are a helpful assistant. 
Be concise.\"},\n", + " {\"role\": \"user\", \"content\": prompt},\n", + " ],\n", + " \"stream\": False,\n", + " \"options\": {\n", + " \"temperature\": temperature,\n", + " \"num_predict\": max_tokens\n", + " }\n", + " }\n", + " endpoint = f\"{OLLAMA_DIRECT}/api/chat\"\n", + " \n", + " t0 = time.time()\n", + " r = requests.post(endpoint, json=body, timeout=120)\n", + " dt = time.time() - t0\n", + " r.raise_for_status()\n", + " \n", + " if USE_WRAPPER:\n", + " response = r.json()\n", + " content = response[\"choices\"][0][\"message\"][\"content\"]\n", + " usage = response.get(\"usage\", {})\n", + " else:\n", + " response = r.json()\n", + " content = response.get(\"message\", {}).get(\"content\", \"\")\n", + " usage = {\"total_tokens\": \"estimated: \" + str(len(content.split()) + len(prompt.split()))}\n", + " \n", + " print(f\"\\nLatency: {dt:.2f}s | usage: {usage}\")\n", + " print(\"\\n---\\n\", content)\n", + " return content\n", + "\n", + "health()\n", + "_ = chat(\"Say 'test ok' then give me one short fun fact about llamas.\")" + ] + }, + { + "cell_type": "markdown", + "id": "553d2756-8949-43e3-8342-71387688e0fa", + "metadata": {}, + "source": [ + "### Cell 5 - Multi-agent pipeline\n", + "\n", + "Implements a simple three-agent workflow (Researcher -> Writer -> Critic) using the local LLM." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f6713f3-8b60-40b2-ad3c-ebf6db4f66e1", + "metadata": {}, + "outputs": [], + "source": [ + "import os, requests, json, time\n", + "\n", + "BASE_URL = \"http://localhost:8000/v1\" \n", + "OLLAMA_DIRECT = os.getenv(\"OLLAMA_HOST\", \"http://ai-starter-kit-ollama:11434\")\n", + "\n", + "def call_llm(role_prompt, user_message, temperature=0.4, max_tokens=150, use_wrapper=True):\n", + " if use_wrapper:\n", + " body = {\n", + " \"model\": \"qwen2.5:1.5b\",\n", + " \"temperature\": temperature,\n", + " \"max_tokens\": max_tokens,\n", + " \"messages\": [\n", + " {\"role\": \"system\", \"content\": role_prompt},\n", + " {\"role\": \"user\", \"content\": user_message}\n", + " ]\n", + " }\n", + " try:\n", + " r = requests.post(f\"{BASE_URL}/chat/completions\", json=body, timeout=120)\n", + " r.raise_for_status()\n", + " return r.json()[\"choices\"][0][\"message\"][\"content\"]\n", + " except Exception as e:\n", + " return f\"Error: {e}\"\n", + " else:\n", + " body = {\n", + " \"model\": \"qwen2.5:1.5b\",\n", + " \"messages\": [\n", + " {\"role\": \"system\", \"content\": role_prompt},\n", + " {\"role\": \"user\", \"content\": user_message}\n", + " ],\n", + " \"stream\": False,\n", + " \"options\": {\n", + " \"temperature\": temperature,\n", + " \"num_predict\": max_tokens\n", + " }\n", + " }\n", + " try:\n", + " r = requests.post(f\"{OLLAMA_DIRECT}/api/chat\", json=body, timeout=120)\n", + " r.raise_for_status()\n", + " return r.json().get(\"message\", {}).get(\"content\", \"\")\n", + " except Exception as e:\n", + " return f\"Error: {e}\"\n", + "\n", + "print(\"=\" * 60)\n", + "print(\"Running Multi-Agent Workflow with Ollama\")\n", + "print(\"=\" * 60)\n", + "\n", + "task = \"Research the latest advancements in quantum computing as of 2025.\"\n", + "\n", + "try:\n", + " r = requests.get(f\"{BASE_URL}/healthz\", timeout=2)\n", + " use_wrapper = r.status_code == 200\n", + " print(\"Using: OpenAI-compatible wrapper\\n\")\n", + "except:\n", + " use_wrapper = False\n", + " print(\"Using: Direct Ollama API\\n\")\n", + "\n", + "print(\"1. RESEARCHER:\")\n", + "print(\"-\" * 40)\n", + "research_prompt = \"You are a researcher. 
Provide 3-4 key facts about the topic. Be concise and factual.\"\n", + "research_notes = call_llm(research_prompt, task, temperature=0.35, max_tokens=140, use_wrapper=use_wrapper)\n", + "print(research_notes)\n", + "time.sleep(1)\n", + "\n", + "print(\"\\n2. WRITER:\")\n", + "print(\"-\" * 40)\n", + "writer_prompt = \"You are a technical writer. Based on the following notes, write a brief report.\"\n", + "writer_task = f\"Write a report based on these notes:\\n{research_notes}\"\n", + "report = call_llm(writer_prompt, writer_task, temperature=0.55, max_tokens=220, use_wrapper=use_wrapper)\n", + "print(report)\n", + "time.sleep(1)\n", + "\n", + "print(\"\\n3. CRITIC/EDITOR:\")\n", + "print(\"-\" * 40)\n", + "critic_prompt = \"You are an editor. Review the report and provide a final polished version.\"\n", + "critic_task = f\"Review and improve this report:\\n{report}\"\n", + "final_output = call_llm(critic_prompt, critic_task, temperature=0.45, max_tokens=160, use_wrapper=use_wrapper)\n", + "print(final_output)\n", + "\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"Multi-agent workflow complete\")\n", + "print(\"=\" * 60)" + ] + }, + { + "cell_type": "markdown", + "id": "0af596cf-5ba6-42df-a030-61d7a20d6f7b", + "metadata": {}, + "source": [ + "### Cell 6 - MLFlow: connect to tracking server and list recent runs\n", + "\n", + "Connects to MLflow tracking server and displays recent model inference runs with metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03a1b042-04df-4cd0-9099-4cc763ecfe9d", + "metadata": {}, + "outputs": [], + "source": [ + "!pip -q install mlflow==2.14.3 --disable-pip-version-check\n", + "\n", + "import os, mlflow\n", + "from datetime import datetime\n", + "\n", + "tracking_uri = os.getenv(\"MLFLOW_TRACKING_URI\", \"http://ai-starter-kit-mlflow:5000\")\n", + "mlflow.set_tracking_uri(tracking_uri)\n", + "print(f\"MLflow Tracking URI: {tracking_uri}\")\n", + "\n", + "exp_name = \"ollama-llm\"\n", + "exp = mlflow.set_experiment(exp_name)\n", + "print(f\"Experiment: {exp.name} (ID: {exp.experiment_id})\")\n", + "print(\"-\" * 60)\n", + "\n", + "client = mlflow.tracking.MlflowClient()\n", + "runs = client.search_runs(\n", + " exp.experiment_id,\n", + " order_by=[\"attributes.start_time DESC\"],\n", + " max_results=10\n", + ")\n", + "\n", + "if not runs:\n", + " print(\"No runs found. 
Run cells 4 or 5 first to generate inference requests.\")\n", + "else:\n", + " print(f\"\\nFound {len(runs)} recent runs:\")\n", + " print(\"-\" * 60)\n", + " \n", + " for i, run in enumerate(runs, 1):\n", + " start_time = datetime.fromtimestamp(run.info.start_time/1000).strftime('%Y-%m-%d %H:%M:%S')\n", + " duration = run.data.metrics.get('duration_ms', 'N/A')\n", + " temp = run.data.params.get('temperature', 'N/A')\n", + " max_tokens = run.data.params.get('max_tokens', 'N/A')\n", + " total_tokens = run.data.metrics.get('total_tokens_approx', 'N/A')\n", + " \n", + " print(f\"\\nRun {i}:\")\n", + " print(f\" ID: {run.info.run_id[:12]}...\")\n", + " print(f\" Time: {start_time}\")\n", + " print(f\" Status: {run.info.status}\")\n", + " print(f\" Temperature: {temp}\")\n", + " print(f\" Max Tokens: {max_tokens}\")\n", + " print(f\" Duration: {duration} ms\")\n", + " print(f\" Total Tokens: {total_tokens}\")\n", + " \n", + " print(\"\\n\" + \"=\" * 60)\n", + " print(\"SUMMARY:\")\n", + " successful = sum(1 for r in runs if r.info.status == 'FINISHED')\n", + " durations = [r.data.metrics.get('duration_ms', 0) for r in runs if r.data.metrics.get('duration_ms')]\n", + " avg_duration = sum(durations) / len(durations) if durations else 0\n", + " \n", + " print(f\" Total Runs: {len(runs)}\")\n", + " print(f\" Successful: {successful}\")\n", + " print(f\" Failed: {len(runs) - successful}\")\n", + " print(f\" Avg Duration: {avg_duration:.1f} ms\" if avg_duration else \" Avg Duration: N/A\")\n", + "\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"MLflow verification complete\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/multi-agent-ramalama.ipynb b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/multi-agent-ramalama.ipynb new file mode 100644 index 000000000..7978d3380 --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/multi-agent-ramalama.ipynb @@ -0,0 +1,554 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "079fadd2-200e-4d37-8ae2-be2792e3a24e", + "metadata": {}, + "source": [ + "### Cell 1 - Install RamaLama and verify environment\n", + "\n", + "Installs RamaLama for local model serving, sets up environment variables, and verifies the installation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79db57cd-fb72-4b10-b0fb-5e9cd5c007b6", + "metadata": {}, + "outputs": [], + "source": [ + "!pip -q install requests --disable-pip-version-check\n", + "\n", + "import os, time, json, requests\n", + "from pathlib import Path\n", + "\n", + "os.environ['RAMALAMA_HOST'] = 'http://ai-starter-kit-ramalama:8080'\n", + "MODEL_NAME = \"qwen2.5:1.5b\"\n", + "MLFLOW_URI = os.getenv(\"MLFLOW_TRACKING_URI\", \"http://ai-starter-kit-mlflow:5000\")\n", + "\n", + "RAMALAMA_HOST = os.environ['RAMALAMA_HOST']\n", + "\n", + "print(\"Environment Configuration:\")\n", + "print(\"RamaLama Host:\", RAMALAMA_HOST)\n", + "print(\"Model: \", MODEL_NAME)\n", + "print(\"MLflow: \", MLFLOW_URI)\n", + "print(\"-\" * 60)\n", + "\n", + "try:\n", + " r = requests.get(f\"{RAMALAMA_HOST}/v1/models\", timeout=5)\n", + " print(\"RamaLama models:\", r.json())\n", + "except Exception as e:\n", + " print(f\"Error connecting to RamaLama: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "fe862173-fd9a-41ae-a27b-63875f788024", + "metadata": {}, + "source": [ + "### Cell 2 - Start RamaLama service and pull model\n", + "\n", + "Starts the RamaLama service if not running, pulls the Qwen 2.5 1.5B model, and verifies it's ready." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34da3e26-6276-48b7-b3ac-c90359df6547", + "metadata": {}, + "outputs": [], + "source": [ + "import requests, os, json\n", + "RAMALAMA_HOST = os.environ.get('RAMALAMA_HOST', 'http://ai-starter-kit-ramalama:8080')\n", + "MODEL_NAME = \"qwen2.5:1.5b\"\n", + "\n", + "def check_ramalama():\n", + " try:\n", + " r = requests.get(f\"{RAMALAMA_HOST}/v1/models\", timeout=2)\n", + " return r.status_code == 200\n", + " except:\n", + " return False\n", + "\n", + "if check_ramalama():\n", + " print(\"RamaLama service is running\")\n", + " \n", + " print(f\"\\nChecking model {MODEL_NAME}...\")\n", + " try:\n", + " r = requests.get(f\"{RAMALAMA_HOST}/v1/models\")\n", + " models = r.json().get('data', [])\n", + " model_exists = any(m.get('id') == MODEL_NAME for m in models)\n", + " \n", + " if not model_exists:\n", + " print(f\"Pulling model {MODEL_NAME}...\")\n", + " \n", + " try:\n", + " test_body = {\n", + " \"model\": MODEL_NAME,\n", + " \"messages\": [{\"role\": \"user\", \"content\": \"test\"}],\n", + " \"max_tokens\": 1\n", + " }\n", + " r = requests.post(f\"{RAMALAMA_HOST}/v1/chat/completions\", json=test_body, timeout=300)\n", + " \n", + " if r.status_code == 200:\n", + " print(f\"Model {MODEL_NAME} loaded successfully\")\n", + " else:\n", + " print(f\"Failed to load model. Status: {r.status_code}\")\n", + " print(\"You may need to pull the model manually in the RamaLama deployment\")\n", + " \n", + " except requests.exceptions.Timeout:\n", + " print(\"Model pull timed out. 
Large models may take longer.\")\n", + " print(\"Check RamaLama logs to monitor progress\")\n", + " except Exception as e:\n", + " print(f\"Error pulling model: {e}\")\n", + " print(\"You may need to pull the model manually in the RamaLama deployment\")\n", + " else:\n", + " print(f\"Model {MODEL_NAME} already available\")\n", + " \n", + " except Exception as e:\n", + " print(f\"Error checking model: {e}\")\n", + "else:\n", + " print(\"Warning: RamaLama service is not running\")\n", + " print(\"Please ensure the deployment is healthy\")" + ] + }, + { + "cell_type": "markdown", + "id": "8111d705-595e-4e65-8479-bdc76191fa31", + "metadata": {}, + "source": [ + "### Cell 3 - Create OpenAI-compatible API wrapper\n", + "\n", + "Sets up a simple FastAPI server that wraps RamaLama with an OpenAI-compatible API, including MLflow tracking. Since RamaLama already provides OpenAI compatibility, this acts as a proxy with logging." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbea1539-e9ab-460a-9cfc-20a42807f616", + "metadata": {}, + "outputs": [], + "source": [ + "!pip -q install fastapi uvicorn mlflow --disable-pip-version-check\n", + "\n", + "import os, subprocess, time, json, requests\n", + "from pathlib import Path\n", + "\n", + "import psutil\n", + "\n", + "api_wrapper_code = '''\n", + "import os, time, uuid, requests, json\n", + "from fastapi import FastAPI, Request\n", + "from fastapi.responses import JSONResponse\n", + "import uvicorn\n", + "\n", + "USE_MLFLOW = False\n", + "try:\n", + " import mlflow\n", + " mlflow_uri = os.getenv(\"MLFLOW_TRACKING_URI\")\n", + " if mlflow_uri:\n", + " mlflow.set_tracking_uri(mlflow_uri)\n", + " mlflow.set_experiment(\"ramalama-llm\")\n", + " USE_MLFLOW = True\n", + "except:\n", + " pass\n", + "\n", + "app = FastAPI()\n", + "RAMALAMA_HOST = os.getenv(\"RAMALAMA_HOST\", \"http://ai-starter-kit-ramalama:8080\")\n", + "MODEL_NAME = os.getenv(\"MODEL_NAME\", \"qwen2.5:1.5b\")\n", + "\n", + "@app.get(\"/v1/healthz\")\n", + "async def health():\n", + " return {\"status\": \"ok\", \"model\": MODEL_NAME}\n", + "\n", + "@app.post(\"/v1/chat/completions\")\n", + "async def chat_completions(request: Request):\n", + " t0 = time.time()\n", + " body = await request.json()\n", + " \n", + " messages = body.get(\"messages\", [])\n", + " temperature = body.get(\"temperature\", 0.7)\n", + " max_tokens = body.get(\"max_tokens\", 256)\n", + " \n", + " payload = {\n", + " \"model\": MODEL_NAME,\n", + " \"messages\": messages,\n", + " \"temperature\": temperature,\n", + " \"max_tokens\": max_tokens,\n", + " \"stream\": False\n", + " }\n", + " \n", + " try:\n", + " r = requests.post(f\"{RAMALAMA_HOST}/v1/chat/completions\", json=payload, timeout=120)\n", + " r.raise_for_status()\n", + " response = r.json()\n", + " \n", + " content = response[\"choices\"][0][\"message\"][\"content\"]\n", + " usage = response.get(\"usage\", {})\n", + " prompt_tokens = usage.get(\"prompt_tokens\", len(\" \".join(m.get(\"content\", \"\") for m in messages).split()))\n", + " completion_tokens = usage.get(\"completion_tokens\", len(content.split()))\n", + " total_tokens = prompt_tokens + completion_tokens\n", + " \n", + " if USE_MLFLOW:\n", + " try:\n", + " with mlflow.start_run():\n", + " mlflow.log_params({\n", + " \"temperature\": temperature,\n", + " \"max_tokens\": max_tokens,\n", + " \"model\": MODEL_NAME\n", + " })\n", + " mlflow.log_metrics({\n", + " \"duration_ms\": int((time.time() - t0) * 1000),\n", + " \"prompt_tokens\": prompt_tokens,\n", + " \"completion_tokens\": 
completion_tokens,\n", + " \"total_tokens\": total_tokens\n", + " })\n", + " except:\n", + " pass\n", + " \n", + " return {\n", + " \"id\": \"chatcmpl-\" + uuid.uuid4().hex[:8],\n", + " \"object\": \"chat.completion\",\n", + " \"created\": int(time.time()),\n", + " \"model\": MODEL_NAME,\n", + " \"choices\": [{\n", + " \"index\": 0,\n", + " \"message\": {\"role\": \"assistant\", \"content\": content},\n", + " \"finish_reason\": \"stop\"\n", + " }],\n", + " \"usage\": {\n", + " \"prompt_tokens\": prompt_tokens,\n", + " \"completion_tokens\": completion_tokens,\n", + " \"total_tokens\": total_tokens\n", + " }\n", + " }\n", + " except Exception as e:\n", + " return JSONResponse(status_code=500, content={\"error\": str(e)})\n", + "\n", + "if __name__ == \"__main__\":\n", + " uvicorn.run(app, host=\"0.0.0.0\", port=8000)\n", + "'''\n", + "\n", + "with open('/tmp/ramalama_wrapper.py', 'w') as f:\n", + " f.write(api_wrapper_code)\n", + "\n", + "print(\"Wrapper script created\")\n", + "\n", + "print(\"Killing existing wrapper processes...\")\n", + "proc_iter = psutil.process_iter(attrs=[\"pid\", \"name\", \"cmdline\"])\n", + "for p in proc_iter:\n", + " for arg in p.info[\"cmdline\"]:\n", + " if \"ollama_wrapper.py\" in arg:\n", + " p.kill()\n", + " print(f\"...Process {p.pid} has been killed\")\n", + " break\n", + "\n", + "time.sleep(2)\n", + "\n", + "log_file = '/tmp/ramalama_wrapper.log'\n", + "!touch /tmp/ramalama_wrapper.log\n", + "!chmod 666 /tmp/ramalama_wrapper.log\n", + "print(\"Log file ready\")\n", + "\n", + "MODEL_NAME = \"qwen2.5:1.5b\"\n", + "MLFLOW_URI = os.getenv(\"MLFLOW_TRACKING_URI\", \"http://ai-starter-kit-mlflow:5000\")\n", + "RAMALAMA_HOST = os.getenv(\"RAMALAMA_HOST\", \"http://ai-starter-kit-ramalama:8080\")\n", + "\n", + "env_vars = {\n", + " 'RAMALAMA_HOST': RAMALAMA_HOST,\n", + " 'MODEL_NAME': MODEL_NAME,\n", + " 'MLFLOW_TRACKING_URI': MLFLOW_URI\n", + "}\n", + "\n", + "print(\"\\nEnvironment variables:\")\n", + "for k, v in env_vars.items():\n", + " print(f\" {k}={v}\")\n", + "\n", + "print(\"\\nStarting API wrapper...\")\n", + "with open(log_file, 'w') as log:\n", + " api_process = subprocess.Popen(\n", + " [\"python\", \"/tmp/ramalama_wrapper.py\"],\n", + " stdout=log,\n", + " stderr=subprocess.STDOUT,\n", + " env={**os.environ, **env_vars},\n", + " start_new_session=True\n", + " )\n", + "\n", + "print(f\"Process started with PID: {api_process.pid}\")\n", + "\n", + "time.sleep(2)\n", + "if api_process.poll() is not None:\n", + " print(f\"\\nERROR: Process died immediately with exit code {api_process.poll()}\")\n", + " print(\"\\nLog contents:\")\n", + " !cat /tmp/ramalama_wrapper.log\n", + " raise Exception(\"API wrapper failed to start\")\n", + "print(\"Process is running\")\n", + "\n", + "print(\"\\nWaiting for API to respond...\")\n", + "API_URL = \"http://localhost:8000\"\n", + "api_ready = False\n", + "\n", + "for i in range(30):\n", + " time.sleep(1)\n", + " try:\n", + " r = requests.get(f\"{API_URL}/v1/healthz\", timeout=1)\n", + " if r.status_code == 200:\n", + " print(f\"\\nAPI is ready! Response: {r.json()}\")\n", + " print(f\"\\nOpenAI-compatible API running at: {API_URL}/v1\")\n", + " print(f\"Health: {API_URL}/v1/healthz\")\n", + " print(f\"Chat: {API_URL}/v1/chat/completions\")\n", + " api_ready = True\n", + " break\n", + " except requests.exceptions.ConnectionError:\n", + " if i % 5 == 0:\n", + " print(f\" Waiting for API... 
({i}s)\")\n", + " except Exception as e:\n", + " if i % 10 == 0:\n", + " print(f\" Unexpected error: {e}\")\n", + "\n", + "if not api_ready:\n", + " print(\"\\nAPI wrapper failed to start within 30 seconds\")\n", + " print(\"\\nChecking if process is still alive...\")\n", + " if api_process.poll() is not None:\n", + " print(f\"Process died with exit code: {api_process.poll()}\")\n", + " else:\n", + " print(\"Process is still running but not responding\")\n", + " \n", + " print(\"\\nLast 50 lines of logs:\")\n", + " !tail -50 /tmp/ramalama_wrapper.log\n", + " \n", + " print(\"\\nChecking if port 8000 is in use:\")\n", + " !netstat -tlnp 2>/dev/null | grep 8000 || echo \"No process on port 8000\"\n", + " \n", + " print(\"\\nNote: You can re-run this cell - the API might just need more time to start\")" + ] + }, + { + "cell_type": "markdown", + "id": "a411c015-c802-4ca1-81bb-3f4790d9626a", + "metadata": {}, + "source": [ + "### Cell 4 - Basic client + latency test\n", + "\n", + "Tests the OpenAI-compatible API with a simple chat request and measures latency." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3be634e2-a82f-42c9-8e31-57e6868a86ee", + "metadata": {}, + "outputs": [], + "source": [ + "import os, time, requests, json\n", + "\n", + "USE_WRAPPER = True\n", + "BASE_URL = \"http://localhost:8000/v1\" if USE_WRAPPER else os.getenv(\"RAMALAMA_HOST\", \"http://127.0.0.1:8080\")\n", + "\n", + "def health():\n", + " if USE_WRAPPER:\n", + " r = requests.get(f\"{BASE_URL}/healthz\", timeout=10)\n", + " print(\"Health:\", r.status_code, r.json())\n", + " else:\n", + " r = requests.get(f\"{BASE_URL}/v1/models\", timeout=10)\n", + " print(\"Health:\", r.status_code, \"Models available:\", r.json().get('data', []))\n", + "\n", + "def chat(prompt, temperature=0.4, max_tokens=220):\n", + " body = {\n", + " \"model\": \"qwen2.5:1.5b\",\n", + " \"temperature\": temperature,\n", + " \"max_tokens\": max_tokens,\n", + " \"messages\": [\n", + " {\"role\": \"system\", \"content\": \"You are a helpful assistant. Be concise.\"},\n", + " {\"role\": \"user\", \"content\": prompt},\n", + " ],\n", + " \"stream\": False\n", + " }\n", + " endpoint = f\"{BASE_URL}/chat/completions\"\n", + " \n", + " t0 = time.time()\n", + " r = requests.post(endpoint, json=body, timeout=120)\n", + " dt = time.time() - t0\n", + " r.raise_for_status()\n", + " \n", + " response = r.json()\n", + " content = response[\"choices\"][0][\"message\"][\"content\"]\n", + " usage = response.get(\"usage\", {\"total_tokens\": \"estimated: \" + str(len(content.split()) + len(prompt.split()))})\n", + " \n", + " print(f\"\\nLatency: {dt:.2f}s | usage: {usage}\")\n", + " print(\"\\n---\\n\", content)\n", + " return content\n", + "\n", + "health()\n", + "_ = chat(\"Say 'test ok' then give me one short fun fact about llamas.\")" + ] + }, + { + "cell_type": "markdown", + "id": "553d2756-8949-43e3-8342-71387688e0fa", + "metadata": {}, + "source": [ + "### Cell 5 - Multi-agent pipeline\n", + "\n", + "Implements a simple three-agent workflow (Researcher -> Writer -> Critic) using the local LLM." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f6713f3-8b60-40b2-ad3c-ebf6db4f66e1", + "metadata": {}, + "outputs": [], + "source": [ + "import os, requests, json, time\n", + "\n", + "BASE_URL = \"http://localhost:8000/v1\" \n", + "RAMALAMA_DIRECT = os.getenv(\"RAMALAMA_HOST\", \"http://127.0.0.1:8080\")\n", + "\n", + "def call_llm(role_prompt, user_message, temperature=0.4, max_tokens=150, use_wrapper=True):\n", + " body = {\n", + " \"model\": \"qwen2.5:1.5b\",\n", + " \"messages\": [\n", + " {\"role\": \"system\", \"content\": role_prompt},\n", + " {\"role\": \"user\", \"content\": user_message}\n", + " ],\n", + " \"temperature\": temperature,\n", + " \"max_tokens\": max_tokens,\n", + " \"stream\": False\n", + " }\n", + " if use_wrapper:\n", + " endpoint = f\"{BASE_URL}/chat/completions\"\n", + " else:\n", + " endpoint = f\"{RAMALAMA_DIRECT}/v1/chat/completions\"\n", + " try:\n", + " r = requests.post(endpoint, json=body, timeout=120)\n", + " r.raise_for_status()\n", + " response = r.json()\n", + " return response[\"choices\"][0][\"message\"][\"content\"]\n", + " except Exception as e:\n", + " return f\"Error: {e}\"\n", + "\n", + "print(\"=\" * 60)\n", + "print(\"Running Multi-Agent Workflow with RamaLama\")\n", + "print(\"=\" * 60)\n", + "\n", + "task = \"Research the latest advancements in quantum computing as of 2025.\"\n", + "\n", + "try:\n", + " r = requests.get(f\"{BASE_URL}/healthz\", timeout=2)\n", + " use_wrapper = r.status_code == 200\n", + " print(\"Using: OpenAI-compatible wrapper\\n\")\n", + "except:\n", + " use_wrapper = False\n", + " print(\"Using: Direct RamaLama API\\n\")\n", + "\n", + "print(\"RESEARCHER:\")\n", + "print(\"-\" * 40)\n", + "research_prompt = \"You are a researcher. Provide 3-4 key facts about the topic. Be concise and factual.\"\n", + "research_notes = call_llm(research_prompt, task, temperature=0.35, max_tokens=140, use_wrapper=use_wrapper)\n", + "print(research_notes)\n", + "time.sleep(1)\n", + "\n", + "print(\"\\nWRITER:\")\n", + "print(\"-\" * 40)\n", + "writer_prompt = \"You are a technical writer. Based on the following notes, write a brief report.\"\n", + "writer_task = f\"Write a report based on these notes:\\n{research_notes}\"\n", + "report = call_llm(writer_prompt, writer_task, temperature=0.55, max_tokens=220, use_wrapper=use_wrapper)\n", + "print(report)\n", + "time.sleep(1)\n", + "\n", + "print(\"\\nCRITIC/EDITOR:\")\n", + "print(\"-\" * 40)\n", + "critic_prompt = \"You are an editor. Review the report and provide a final polished version.\"\n", + "critic_task = f\"Review and improve this report:\\n{report}\"\n", + "final_output = call_llm(critic_prompt, critic_task, temperature=0.45, max_tokens=160, use_wrapper=use_wrapper)\n", + "print(final_output)\n", + "\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"Multi-agent workflow complete\")\n", + "print(\"=\" * 60)" + ] + }, + { + "cell_type": "markdown", + "id": "0af596cf-5ba6-42df-a030-61d7a20d6f7b", + "metadata": {}, + "source": [ + "### Cell 6 - MLFlow: connect to tracking server and list recent runs\n", + "\n", + "Connects to MLflow tracking server and displays recent model inference runs with metrics." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03a1b042-04df-4cd0-9099-4cc763ecfe9d", + "metadata": {}, + "outputs": [], + "source": [ + "!pip -q install mlflow==2.14.3 --disable-pip-version-check\n", + "\n", + "import os, mlflow\n", + "from datetime import datetime\n", + "\n", + "tracking_uri = os.getenv(\"MLFLOW_TRACKING_URI\", \"http://ai-starter-kit-mlflow:5000\")\n", + "mlflow.set_tracking_uri(tracking_uri)\n", + "print(f\"MLflow Tracking URI: {tracking_uri}\")\n", + "\n", + "exp_name = \"ramalama-llm\"\n", + "exp = mlflow.set_experiment(exp_name)\n", + "print(f\"Experiment: {exp.name} (ID: {exp.experiment_id})\")\n", + "print(\"-\" * 60)\n", + "\n", + "client = mlflow.tracking.MlflowClient()\n", + "runs = client.search_runs(\n", + " exp.experiment_id,\n", + " order_by=[\"attributes.start_time DESC\"],\n", + " max_results=10\n", + ")\n", + "\n", + "if not runs:\n", + " print(\"No runs found. Run cells 4 or 5 first to generate inference requests.\")\n", + "else:\n", + " print(f\"\\nFound {len(runs)} recent runs:\")\n", + " print(\"-\" * 60)\n", + " \n", + " for i, run in enumerate(runs, 1):\n", + " start_time = datetime.fromtimestamp(run.info.start_time/1000).strftime('%Y-%m-%d %H:%M:%S')\n", + " duration = run.data.metrics.get('duration_ms', 'N/A')\n", + " temp = run.data.params.get('temperature', 'N/A')\n", + " max_tokens = run.data.params.get('max_tokens', 'N/A')\n", + " total_tokens = run.data.metrics.get('total_tokens', 'N/A')\n", + " \n", + " print(f\"\\nRun {i}:\")\n", + " print(f\" ID: {run.info.run_id[:12]}...\")\n", + " print(f\" Time: {start_time}\")\n", + " print(f\" Status: {run.info.status}\")\n", + " print(f\" Temperature: {temp}\")\n", + " print(f\" Max Tokens: {max_tokens}\")\n", + " print(f\" Duration: {duration} ms\")\n", + " print(f\" Total Tokens: {total_tokens}\")\n", + " \n", + " print(\"\\n\" + \"=\" * 60)\n", + " print(\"SUMMARY:\")\n", + " successful = sum(1 for r in runs if r.info.status == 'FINISHED')\n", + " durations = [r.data.metrics.get('duration_ms', 0) for r in runs if r.data.metrics.get('duration_ms')]\n", + " avg_duration = sum(durations) / len(durations) if durations else 0\n", + " \n", + " print(f\" Total Runs: {len(runs)}\")\n", + " print(f\" Successful: {successful}\")\n", + " print(f\" Failed: {len(runs) - successful}\")\n", + " print(f\" Avg Duration: {avg_duration:.1f} ms\" if avg_duration else \" Avg Duration: N/A\")\n", + "\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"MLflow verification complete\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/multi-agent.ipynb b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/multi-agent.ipynb new file mode 100644 index 000000000..23189a639 --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/multi-agent.ipynb @@ -0,0 +1,687 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "079fadd2-200e-4d37-8ae2-be2792e3a24e", + "metadata": {}, + "source": [ + "### Cell 1 - Initialize Ray endpoints and verify dashboard\n", + "\n", + "Installs requests, derives the Ray head host from RAY_ADDRESS, builds Dashboard/Serve/MLflow URLs, reads a Hugging Face token, and prints the endpoints plus the Jobs API version for a quick health check."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79db57cd-fb72-4b10-b0fb-5e9cd5c007b6", + "metadata": {}, + "outputs": [], + "source": [ + "!pip -q install requests==2.* --disable-pip-version-check\n", + "\n", + "import os, textwrap, base64, time, json, requests\n", + "from string import Template\n", + "\n", + "raw_addr = os.getenv(\"RAY_ADDRESS\", \"ray://ai-starter-kit-kuberay-head-svc:10001\")\n", + "if raw_addr.startswith(\"ray://\"):\n", + " HEAD_HOST = raw_addr.split(\"://\", 1)[1].split(\":\", 1)[0]\n", + "else:\n", + " HEAD_HOST = raw_addr.split(\":\", 1)[0] or \"ai-starter-kit-kuberay-head-svc\"\n", + "\n", + "DASH_URL = f\"http://{HEAD_HOST}:8265\"\n", + "SERVE_PORT = int(os.getenv(\"SERVE_PORT\", \"8000\"))\n", + "SERVE_ROUTE = \"/v1\"\n", + "\n", + "HF_TOKEN_PATH = \"/etc/secrets/huggingface/token\"\n", + "HF_TOKEN = \"\"\n", + "if os.path.exists(HF_TOKEN_PATH):\n", + " try:\n", + " HF_TOKEN = open(HF_TOKEN_PATH).read().strip()\n", + " except Exception:\n", + " HF_TOKEN = \"\"\n", + "\n", + "print(\"Head host:\", HEAD_HOST)\n", + "print(\"Jobs API :\", f\"{DASH_URL}/api/jobs/\")\n", + "print(\"Serve URL:\", f\"http://{HEAD_HOST}:{SERVE_PORT}{SERVE_ROUTE}\")\n", + "print(\"MLflow :\", os.getenv(\"MLFLOW_TRACKING_URI\", \"http://ai-starter-kit-mlflow:5000\"))\n", + "\n", + "print(\"Jobs API version:\", requests.get(f\"{DASH_URL}/api/version\", timeout=10).json())\n" + ] + }, + { + "cell_type": "markdown", + "id": "fe862173-fd9a-41ae-a27b-63875f788024", + "metadata": {}, + "source": [ + "### Cell 2 - Deploy a minimal Ray Serve smoke test and verify readiness\n", + "\n", + "Submits a tiny FastAPI app to Ray Serve (one /healthz endpoint under /smoke) as a Ray Job, installing FastAPI on the fly. It polls the Jobs API for status and hits :8000/smoke/healthz up to 60 seconds, printing when the service responds 200 (i.e., smoke test passes)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34da3e26-6276-48b7-b3ac-c90359df6547", + "metadata": {}, + "outputs": [], + "source": [ + "import os, base64, textwrap, time, requests\n", + "\n", + "DASH_URL = \"http://ai-starter-kit-kuberay-head-svc:8265\"\n", + "\n", + "print(\"Jobs API:\", requests.get(f\"{DASH_URL}/api/version\", timeout=10).json())\n", + "\n", + "serve_py = textwrap.dedent(\"\"\"\n", + " from fastapi import FastAPI\n", + " from ray import serve\n", + " serve.start(detached=True, http_options={\"host\":\"0.0.0.0\",\"port\":8000})\n", + " app = FastAPI()\n", + "\n", + " @serve.deployment(name=\"smoke\", num_replicas=1)\n", + " @serve.ingress(app)\n", + " class Smoke:\n", + " @app.get(\"/healthz\")\n", + " async def health(self): return {\"ok\": True}\n", + "\n", + " serve.run(Smoke.bind(), route_prefix=\"/smoke\")\n", + " print(\"READY: smoke\", flush=True)\n", + "\"\"\").strip()\n", + "\n", + "b64 = base64.b64encode(serve_py.encode()).decode()\n", + "entry = f'python -c \"import base64; exec(base64.b64decode(\\'{b64}\\'))\"'\n", + "submit = requests.post(f\"{DASH_URL}/api/jobs/\", json={\"entrypoint\": entry, \"runtime_env\": {\"pip\": [\"fastapi>=0.110\"]}}, timeout=60).json()\n", + "job_id = submit[\"job_id\"]\n", + "print(\"Job:\", job_id)\n", + "\n", + "svc = \"http://ai-starter-kit-kuberay-head-svc:8000/smoke/healthz\"\n", + "for i in range(60):\n", + " s = requests.get(f\"{DASH_URL}/api/jobs/{job_id}\", timeout=10).json()[\"status\"]\n", + " try:\n", + " r = requests.get(svc, timeout=2)\n", + " print(f\"tick {i:02d}: job={s}, health={r.status_code}\")\n", + " if r.status_code == 200:\n", + " print(\"Smoke OK\")\n", + " break\n", + " except Exception as e:\n", + " print(f\"tick {i:02d}: job={s}, health=ERR {e}\")\n", + " time.sleep(1)" + ] + }, + { + "cell_type": "markdown", + "id": "8111d705-595e-4e65-8479-bdc76191fa31", + "metadata": {}, + "source": [ + "### Cell 3 - Deploy model on Ray Serve with llama-cpp\n", + "\n", + "Packages and submits a Ray Job that spins up a Ray Serve app exposing /v1/healthz and /v1/chat/completions. It downloads the preferred GGUF from Hugging Face, initializes llama-cpp-python, logs to MLflow, and prints the deployed health/chat URLs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbea1539-e9ab-460a-9cfc-20a42807f616", + "metadata": {}, + "outputs": [], + "source": [ + "import os, base64, textwrap, requests\n", + "\n", + "HEAD = os.environ.get(\"RAY_HEAD_SVC\", \"ai-starter-kit-kuberay-head-svc\")\n", + "DASH_URL = f\"http://{HEAD}:8265\"\n", + "SERVE_PORT = 8000\n", + "SERVE_ROUTE = \"/v1\"\n", + "\n", + "runtime_env = {\n", + " \"pip\": [\n", + " \"fastapi==0.110.0\",\n", + " \"uvicorn==0.23.2\",\n", + " \"huggingface_hub==0.25.2\",\n", + " \"llama-cpp-python==0.3.16\", \n", + " \"hf_transfer==0.1.6\",\n", + " \"mlflow==2.14.3\", \n", + " ],\n", + " \"env_vars\": {\n", + " \"HF_HUB_ENABLE_HF_TRANSFER\": \"1\",\n", + " \"HUGGINGFACE_HUB_TOKEN\": os.environ.get(\"HUGGINGFACE_HUB_TOKEN\", \"\"),\n", + " \"SERVE_PORT\": str(SERVE_PORT),\n", + "\n", + " \"MODEL_REPO\": \"Qwen/Qwen2.5-1.5B-Instruct-GGUF\",\n", + " \"GGUF_PREF_ORDER\": \"q4_k_m,q4_0,q3_k_m,q2_k\",\n", + "\n", + " \"LLM_CONTEXT\": os.environ.get(\"LLM_CONTEXT\", \"1024\"),\n", + " \"LLM_MAX_TOKENS\": os.environ.get(\"LLM_MAX_TOKENS\", \"256\"),\n", + " \"SERVER_MAX_NEW_TOKENS\": os.environ.get(\"SERVER_MAX_NEW_TOKENS\", \"512\"),\n", + "\n", + " \"LLM_THREADS\": os.environ.get(\"LLM_THREADS\", \"6\"),\n", + " \"OMP_NUM_THREADS\": os.environ.get(\"OMP_NUM_THREADS\", \"6\"),\n", + " \"GPU_LAYERS\": \"0\", \n", + " \n", + " \"PIP_PREFER_BINARY\": \"1\",\n", + " \"CMAKE_ARGS\": \"-DGGML_OPENMP=OFF -DLLAMA_NATIVE=OFF\",\n", + "\n", + " \"HF_HOME\": \"/tmp/hf-cache\",\n", + " \"TRANSFORMERS_CACHE\": \"/tmp/hf-cache\",\n", + "\n", + " \"MLFLOW_TRACKING_URI\": os.environ.get(\"MLFLOW_TRACKING_URI\", \"\"),\n", + " \"MLFLOW_EXPERIMENT_NAME\": os.environ.get(\"MLFLOW_EXPERIMENT_NAME\", \"ray-llama-cpp\"),\n", + " },\n", + "}\n", + "\n", + "serve_py = textwrap.dedent(f\"\"\"\n", + "import os, time, multiprocessing, uuid\n", + "from typing import List, Dict, Any\n", + "from fastapi import FastAPI, Request\n", + "from fastapi.responses import JSONResponse\n", + "from huggingface_hub import HfApi, hf_hub_download\n", + "from ray import serve\n", + "from llama_cpp import Llama\n", + "\n", + "USE_MLFLOW = False\n", + "try:\n", + " import mlflow\n", + " if os.getenv(\"MLFLOW_TRACKING_URI\"):\n", + " mlflow.set_tracking_uri(os.getenv(\"MLFLOW_TRACKING_URI\"))\n", + " mlflow.set_experiment(os.getenv(\"MLFLOW_EXPERIMENT_NAME\",\"ray-llama-cpp\"))\n", + " USE_MLFLOW = True\n", + "except Exception as _e:\n", + " USE_MLFLOW = False\n", + "\n", + "SERVE_PORT = int(os.getenv(\"SERVE_PORT\", \"{SERVE_PORT}\"))\n", + "SERVE_ROUTE = \"{SERVE_ROUTE}\"\n", + "MODEL_REPO = os.getenv(\"MODEL_REPO\", \"Qwen/Qwen2.5-1.5B-Instruct-GGUF\")\n", + "GGUF_PREFS = [s.strip() for s in os.getenv(\"GGUF_PREF_ORDER\",\"q4_k_m,q4_0,q3_k_m,q2_k\").split(\",\") if s.strip()]\n", + "CTX_LEN = int(os.getenv(\"LLM_CONTEXT\", \"2048\"))\n", + "MAX_TOKENS = int(os.getenv(\"LLM_MAX_TOKENS\", \"256\"))\n", + "HF_TOKEN = os.getenv(\"HUGGINGFACE_HUB_TOKEN\") or None\n", + "\n", + "serve.start(detached=True, http_options={{\"host\":\"0.0.0.0\", \"port\":SERVE_PORT}})\n", + "app = FastAPI()\n", + "\n", + "def pick_one_file(repo_id: str, prefs):\n", + " api = HfApi()\n", + " files = api.list_repo_files(repo_id=repo_id, repo_type=\"model\", token=HF_TOKEN)\n", + " ggufs = [f for f in files if f.lower().endswith(\".gguf\")]\n", + " if not ggufs:\n", + " raise RuntimeError(f\"No .gguf files visible in {{repo_id}}\")\n", + " for pref in prefs:\n", + " for f in ggufs:\n", + " if pref.lower() in 
f.lower():\n", + " return f\n", + " return ggufs[0]\n", + "\n", + "def pick_chat_format(repo: str, fname: str) -> str:\n", + " return \"qwen\"\n", + "\n", + "@serve.deployment(name=\"qwen\", num_replicas=1, ray_actor_options={{\"num_cpus\": 6}})\n", + "@serve.ingress(app)\n", + "class OpenAICompatLlama:\n", + " def __init__(self, repo_id: str = MODEL_REPO):\n", + " target = pick_one_file(repo_id, GGUF_PREFS)\n", + " print(f\"[env] model repo: {{repo_id}} file: {{target}}\", flush=True)\n", + " local_dir = \"/tmp/hf-gguf\"; os.makedirs(local_dir, exist_ok=True)\n", + "\n", + " gguf_path = hf_hub_download(\n", + " repo_id=repo_id, filename=target, token=HF_TOKEN,\n", + " local_dir=local_dir, local_dir_use_symlinks=False,\n", + " force_download=False, resume_download=True\n", + " )\n", + " print(f\"[download] done: {{gguf_path}}\", flush=True)\n", + "\n", + " n_threads = int(os.getenv(\"LLM_THREADS\", max(2, (multiprocessing.cpu_count() or 4)//2)))\n", + " print(f\"[load] llama-cpp-python | ctx={{CTX_LEN}} threads={{n_threads}} gpu_layers={{int(os.getenv('GPU_LAYERS','0'))}}\", flush=True)\n", + "\n", + " self.model_file = os.path.basename(gguf_path)\n", + " self.model_repo = repo_id\n", + " chat_format = pick_chat_format(self.model_repo, self.model_file)\n", + " print(f\"[load] chat_format={{chat_format}}\", flush=True)\n", + "\n", + " self.llm = Llama(\n", + " model_path=gguf_path,\n", + " n_ctx=CTX_LEN,\n", + " n_threads=n_threads,\n", + " n_batch=256, \n", + " n_gpu_layers=int(os.getenv(\"GPU_LAYERS\",\"0\")),\n", + " chat_format=chat_format,\n", + " verbose=False\n", + " )\n", + " print(\"[ready] model loaded\", flush=True)\n", + "\n", + " @app.get(\"/healthz\")\n", + " async def health(self):\n", + " return {{\"status\":\"ok\"}}\n", + "\n", + " @app.post(\"/chat/completions\")\n", + " async def chat_completions(self, request: Request):\n", + " t0 = time.time()\n", + " body = await request.json()\n", + "\n", + " messages = body.get(\"messages\", [])\n", + " temperature = float(body.get(\"temperature\", 0.2))\n", + " req_max = body.get(\"max_tokens\", None)\n", + " stop_words = (body.get(\"stop\", []) or []) + [\"<|im_end|>\", \"\"]\n", + "\n", + " SERVER_MAX = int(os.getenv(\"SERVER_MAX_NEW_TOKENS\", \"512\"))\n", + " max_tokens = int(req_max if isinstance(req_max, int) else MAX_TOKENS)\n", + " max_tokens = max(32, min(max_tokens, CTX_LEN - 128, SERVER_MAX))\n", + "\n", + " rid = \"chatcmpl-\" + uuid.uuid4().hex[:24]\n", + " created = int(time.time())\n", + " model_name = f\"{{self.model_repo}}/{{self.model_file}}\"\n", + "\n", + " try:\n", + " result = self.llm.create_chat_completion(\n", + " messages=messages,\n", + " temperature=temperature,\n", + " max_tokens=max_tokens,\n", + " top_k=50,\n", + " top_p=0.9,\n", + " repeat_penalty=1.1,\n", + " stop=stop_words,\n", + " )\n", + " out_text = (result[\"choices\"][0][\"message\"][\"content\"] or \"\").strip()\n", + " usage_raw = result.get(\"usage\") or {{}}\n", + " p_tokens = int(usage_raw.get(\"prompt_tokens\") or 0)\n", + " c_tokens = int(usage_raw.get(\"completion_tokens\") or 0)\n", + " err = None\n", + " except Exception as e:\n", + " out_text = \"\"\n", + " p_tokens = c_tokens = 0\n", + " err = str(e)\n", + "\n", + " if USE_MLFLOW:\n", + " try:\n", + " dur_ms = int((time.time()-t0) * 1000)\n", + " with mlflow.start_run(run_name=\"chat\"):\n", + " mlflow.set_tags({{\n", + " \"model_repo\": self.model_repo,\n", + " \"model_file\": self.model_file,\n", + " \"framework\": \"llama-cpp-python\",\n", + " }})\n", + " 
mlflow.log_params({{\n", + " \"temperature\": temperature,\n", + " \"max_tokens\": max_tokens,\n", + " \"ctx\": CTX_LEN,\n", + " }})\n", + " if not (p_tokens and c_tokens):\n", + " p_tokens = p_tokens or max(1, len(\" \".join(m.get(\"content\",\"\") for m in messages).split()))\n", + " c_tokens = c_tokens or max(0, len(out_text.split()))\n", + " mlflow.log_metrics({{\n", + " \"duration_ms\": dur_ms,\n", + " \"prompt_tokens_approx\": p_tokens,\n", + " \"completion_tokens_approx\": c_tokens,\n", + " \"total_tokens_approx\": p_tokens + c_tokens,\n", + " }})\n", + " except Exception:\n", + " pass\n", + "\n", + " if err:\n", + " return JSONResponse(status_code=500, content={{\"error\": err, \"type\":\"generation_error\"}})\n", + "\n", + " usage = {{\n", + " \"prompt_tokens\": p_tokens,\n", + " \"completion_tokens\": c_tokens,\n", + " \"total_tokens\": p_tokens + c_tokens,\n", + " }}\n", + " return {{\n", + " \"id\": rid,\n", + " \"object\": \"chat.completion\",\n", + " \"created\": created,\n", + " \"model\": model_name,\n", + " \"choices\": [\n", + " {{\n", + " \"index\": 0,\n", + " \"message\": {{\"role\":\"assistant\",\"content\": out_text}},\n", + " \"finish_reason\": \"stop\"\n", + " }}\n", + " ],\n", + " \"usage\": usage\n", + " }}\n", + "\n", + "serve.run(OpenAICompatLlama.bind(), route_prefix=SERVE_ROUTE)\n", + "print(\"READY\", flush=True)\n", + "\"\"\").strip()\n", + "\n", + "payload = base64.b64encode(serve_py.encode()).decode()\n", + "entrypoint = 'python -c \"import base64,sys;exec(base64.b64decode(\\'{}\\').decode())\"'.format(payload)\n", + "\n", + "job = requests.post(\n", + " f\"{DASH_URL}/api/jobs/\",\n", + " json={\n", + " \"entrypoint\": entrypoint,\n", + " \"runtime_env\": runtime_env,\n", + " \"metadata\": {\"job_name\": \"serve-qwen2_5-llama_cpp-openai\"},\n", + " },\n", + " timeout=45\n", + ").json()\n", + "\n", + "print(\"Job:\", job.get(\"job_id\"))\n", + "print(\"Health:\", f\"http://{HEAD}:{SERVE_PORT}{SERVE_ROUTE}/healthz\")\n", + "print(\"Chat: \", f\"http://{HEAD}:{SERVE_PORT}{SERVE_ROUTE}/chat/completions\")" + ] + }, + { + "cell_type": "markdown", + "id": "a411c015-c802-4ca1-81bb-3f4790d9626a", + "metadata": {}, + "source": [ + "### Cell 4 - Basic client + latency test\n", + "\n", + "Calls /v1/healthz and then sends an OpenAI-style chat request to /v1/chat/completions with a short prompt. Prints latency and token usage, returning the assistant text." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3be634e2-a82f-42c9-8e31-57e6868a86ee", + "metadata": {}, + "outputs": [], + "source": [ + "import os, time, requests, json\n", + "\n", + "HEAD = os.environ.get(\"RAY_HEAD_SVC\", \"ai-starter-kit-kuberay-head-svc\")\n", + "SERVE_PORT = 8000\n", + "BASE_URL = f\"http://{HEAD}:{SERVE_PORT}/v1\"\n", + "\n", + "def health():\n", + " r = requests.get(f\"{BASE_URL}/healthz\", timeout=10)\n", + " print(\"Health:\", r.status_code, r.json())\n", + "\n", + "def chat(prompt, temperature=0.4, max_tokens=220, stop=None):\n", + " body = {\n", + " \"model\": \"qwen2.5-1.5b-instruct-gguf\",\n", + " \"temperature\": float(temperature),\n", + " \"max_tokens\": int(max_tokens),\n", + " \"messages\": [\n", + " {\"role\": \"system\", \"content\": \"You are Qwen2.5 Instruct running on a tiny CPU host. 
Be concise, complete sentences.\"},\n", + " {\"role\": \"user\", \"content\": prompt},\n", + " ],\n", + " }\n", + " if stop:\n", + " body[\"stop\"] = stop\n", + "\n", + " t0 = time.time()\n", + " r = requests.post(f\"{BASE_URL}/chat/completions\", json=body, timeout=300)\n", + " dt = time.time() - t0\n", + " r.raise_for_status()\n", + " out = r.json()[\"choices\"][0][\"message\"][\"content\"]\n", + " usage = r.json().get(\"usage\", {})\n", + " print(f\"\\nLatency: {dt:.2f}s | usage: {usage}\")\n", + " print(\"\\n---\\n\", out)\n", + " return out\n", + "\n", + "health()\n", + "_ = chat(\"Say 'test ok' then give me one short fun fact about llamas.\", stop=[\"<|im_end|>\"])" + ] + }, + { + "cell_type": "markdown", + "id": "553d2756-8949-43e3-8342-71387688e0fa", + "metadata": {}, + "source": [ + "### Cell 5 - Multi-agent (Autogen) pipeline\n", + "\n", + "Installs Autogen, configures OpenAIWrapper to hit Ray Serve /v1 endpoint, warms up the model, then runs a simple three-agent workflow (Researcher -> Writer -> Critic) to produce and refine a short report." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f6713f3-8b60-40b2-ad3c-ebf6db4f66e1", + "metadata": {}, + "outputs": [], + "source": [ + "import os, requests, json, time\n", + "\n", + "HEAD = os.environ.get(\"RAY_HEAD_SVC\", \"ai-starter-kit-kuberay-head-svc\")\n", + "SERVE_PORT = 8000\n", + "BASE_URL = f\"http://{HEAD}:{SERVE_PORT}/v1\"\n", + "\n", + "def call_llm(role_prompt, user_message, temperature=0.4, max_tokens=150):\n", + " body = {\n", + " \"model\": \"qwen2.5-1.5b-instruct-gguf\",\n", + " \"temperature\": temperature,\n", + " \"max_tokens\": max_tokens,\n", + " \"messages\": [\n", + " {\"role\": \"system\", \"content\": role_prompt},\n", + " {\"role\": \"user\", \"content\": user_message}\n", + " ]\n", + " }\n", + " try:\n", + " r = requests.post(f\"{BASE_URL}/chat/completions\", json=body, timeout=120)\n", + " r.raise_for_status()\n", + " return r.json()[\"choices\"][0][\"message\"][\"content\"]\n", + " except Exception as e:\n", + " return f\"Error: {e}\"\n", + "\n", + "# Try to use autogen if available, otherwise use direct implementation\n", + "USE_AUTOGEN = False\n", + "\n", + "try:\n", + " import autogen\n", + " from autogen import AssistantAgent, UserProxyAgent\n", + " USE_AUTOGEN = True\n", + " print(\"Using autogen for multi-agent workflow\")\n", + "except ImportError:\n", + " try:\n", + " print(\"Installing autogen dependencies...\")\n", + " !pip install -q pyautogen~=0.2.35 python-dotenv tiktoken \"numpy<2,>=1.17.0\" --disable-pip-version-check 2>/dev/null\n", + " import autogen\n", + " from autogen import AssistantAgent, UserProxyAgent\n", + " USE_AUTOGEN = True\n", + " print(\"Autogen installed successfully\")\n", + " except:\n", + " print(\"Using direct implementation (autogen not available)\")\n", + " USE_AUTOGEN = False\n", + "\n", + "if USE_AUTOGEN:\n", + " config_list = [\n", + " {\n", + " \"model\": \"qwen2.5-1.5b-instruct-gguf\",\n", + " \"base_url\": BASE_URL,\n", + " \"api_key\": \"local\",\n", + " \"price\": [0.0, 0.0],\n", + " }\n", + " ]\n", + " \n", + " llm = autogen.OpenAIWrapper(config_list=config_list)\n", + " \n", + " try:\n", + " r = llm.create(messages=[{\"role\":\"user\",\"content\":\"Say 'test ok'.\"}], temperature=0.2, max_tokens=16)\n", + " print(\"Warmup:\", r.choices[0].message.content)\n", + " except Exception as e:\n", + " print(\"Warmup skipped:\", e)\n", + " \n", + " user_proxy = UserProxyAgent(\n", + " name=\"UserProxy\",\n", + " system_message=\"You are the human 
admin. Initiate the task.\",\n", + " code_execution_config=False,\n", + " human_input_mode=\"NEVER\",\n", + " )\n", + " \n", + " researcher = AssistantAgent(\n", + " name=\"Researcher\",\n", + " system_message=(\n", + " \"You are a researcher. Gather concise, verified facts on the topic. \"\n", + " \"Return 3-4 bullet points. Keep under 100 words total.\"\n", + " ),\n", + " llm_config={\"config_list\": config_list, \"temperature\": 0.35, \"max_tokens\": 140, \"timeout\": 120},\n", + " )\n", + " \n", + " writer = AssistantAgent(\n", + " name=\"Writer\",\n", + " system_message=(\n", + " \"You are a writer. Using the Researcher's notes, produce a clear report under 160 words.\"\n", + " ),\n", + " llm_config={\"config_list\": config_list, \"temperature\": 0.55, \"max_tokens\": 220, \"timeout\": 180},\n", + " )\n", + " \n", + " critic = AssistantAgent(\n", + " name=\"Critic\",\n", + " system_message=(\n", + " \"You are a critic. Review the Writer's report for accuracy and clarity. \"\n", + " \"Present the final polished text under 140 words.\"\n", + " ),\n", + " llm_config={\"config_list\": config_list, \"temperature\": 0.45, \"max_tokens\": 160, \"timeout\": 120},\n", + " )\n", + " \n", + " def run_sequential(task):\n", + " print(\"\\n\" + \"=\" * 60)\n", + " print(\"Running Multi-Agent Workflow (with autogen)\")\n", + " print(\"=\" * 60)\n", + " \n", + " research_response = researcher.generate_reply(messages=[{\"content\": task, \"role\": \"user\"}])\n", + " research_notes = research_response if isinstance(research_response, str) else research_response.get(\"content\", \"[no output]\")\n", + " print(\"\\n1. RESEARCHER:\")\n", + " print(\"-\" * 40)\n", + " print(research_notes)\n", + " \n", + " writer_prompt = f\"Using these research notes, write the report:\\n{research_notes}\"\n", + " writer_response = writer.generate_reply(messages=[{\"content\": writer_prompt, \"role\": \"user\"}])\n", + " report = writer_response if isinstance(writer_response, str) else writer_response.get(\"content\", \"[no output]\")\n", + " print(\"\\n2. WRITER:\")\n", + " print(\"-\" * 40)\n", + " print(report)\n", + " \n", + " critic_prompt = f\"Review this report:\\n{report}\"\n", + " critic_response = critic.generate_reply(messages=[{\"content\": critic_prompt, \"role\": \"user\"}])\n", + " final_text = critic_response if isinstance(critic_response, str) else critic_response.get(\"content\", \"[no output]\")\n", + " print(\"\\n3. CRITIC/EDITOR:\")\n", + " print(\"-\" * 40)\n", + " print(final_text)\n", + " return final_text\n", + " \n", + " task = \"Research the latest advancements in quantum computing as of 2025. Gather key facts, then write a short report.\"\n", + " final_output = run_sequential(task)\n", + " \n", + "else:\n", + " print(\"=\" * 60)\n", + " print(\"Running Multi-Agent Workflow (direct implementation)\")\n", + " print(\"=\" * 60)\n", + " \n", + " task = \"Research the latest advancements in quantum computing as of 2025.\"\n", + " \n", + " print(\"\\n1. RESEARCHER:\")\n", + " print(\"-\" * 40)\n", + " research_prompt = \"You are a researcher. Provide 3-4 key facts about the topic. Be concise and factual.\"\n", + " research_notes = call_llm(research_prompt, task, temperature=0.35, max_tokens=140)\n", + " print(research_notes)\n", + " time.sleep(1) \n", + " \n", + " print(\"\\n2. WRITER:\")\n", + " print(\"-\" * 40)\n", + " writer_prompt = \"You are a technical writer. 
Based on the following notes, write a brief report.\"\n", + " writer_task = f\"Write a report based on these notes:\\n{research_notes}\"\n", + " report = call_llm(writer_prompt, writer_task, temperature=0.55, max_tokens=220)\n", + " print(report)\n", + " time.sleep(1)\n", + " \n", + " print(\"\\n3. CRITIC/EDITOR:\")\n", + " print(\"-\" * 40)\n", + " critic_prompt = \"You are an editor. Review the report and provide a final polished version.\"\n", + " critic_task = f\"Review and improve this report:\\n{report}\"\n", + " final_output = call_llm(critic_prompt, critic_task, temperature=0.45, max_tokens=160)\n", + " print(final_output)\n", + "\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"Multi-agent workflow complete\")\n", + "print(\"=\" * 60)" + ] + }, + { + "cell_type": "markdown", + "id": "0af596cf-5ba6-42df-a030-61d7a20d6f7b", + "metadata": {}, + "source": [ + "### Cell 6 - MLFlow: connect to tracking server and list recent chat runs\n", + "\n", + "Installs MLflow, sets the tracking URI and experiment, then queries and prints the latest runs with key params/metrics (temperature, max_tokens, duration) to verify Serve logging." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03a1b042-04df-4cd0-9099-4cc763ecfe9d", + "metadata": {}, + "outputs": [], + "source": [ + "!pip -q install mlflow==2.14.3 --disable-pip-version-check\n", + "\n", + "import os, mlflow\n", + "from datetime import datetime\n", + "\n", + "tracking_uri = os.getenv(\"MLFLOW_TRACKING_URI\", \"http://ai-starter-kit-mlflow:5000\")\n", + "mlflow.set_tracking_uri(tracking_uri)\n", + "print(f\"MLflow Tracking URI: {tracking_uri}\")\n", + "\n", + "exp_name = os.getenv(\"MLFLOW_EXPERIMENT_NAME\", \"ray-llama-cpp\")\n", + "exp = mlflow.set_experiment(exp_name)\n", + "print(f\"Experiment: {exp.name} (ID: {exp.experiment_id})\")\n", + "print(\"-\" * 60)\n", + "\n", + "client = mlflow.tracking.MlflowClient()\n", + "runs = client.search_runs(\n", + " exp.experiment_id, \n", + " order_by=[\"attributes.start_time DESC\"], \n", + " max_results=10\n", + ")\n", + "\n", + "if not runs:\n", + " print(\"No runs found. 
Run cells 4 or 5 first to generate inference requests.\")\n", + "else:\n", + " print(f\"\\nFound {len(runs)} recent runs:\")\n", + " print(\"-\" * 60)\n", + " \n", + " for i, run in enumerate(runs, 1):\n", + " start_time = datetime.fromtimestamp(run.info.start_time/1000).strftime('%Y-%m-%d %H:%M:%S')\n", + " duration = run.data.metrics.get('duration_ms', 'N/A')\n", + " temp = run.data.params.get('temperature', 'N/A')\n", + " max_tokens = run.data.params.get('max_tokens', 'N/A')\n", + " total_tokens = run.data.metrics.get('total_tokens_approx', 'N/A')\n", + " \n", + " print(f\"\\nRun {i}:\")\n", + " print(f\" ID: {run.info.run_id[:12]}...\")\n", + " print(f\" Time: {start_time}\")\n", + " print(f\" Status: {run.info.status}\")\n", + " print(f\" Temperature: {temp}\")\n", + " print(f\" Max Tokens: {max_tokens}\")\n", + " print(f\" Duration: {duration} ms\")\n", + " print(f\" Total Tokens: {total_tokens}\")\n", + " \n", + " print(\"\\n\" + \"=\" * 60)\n", + " print(\"SUMMARY:\")\n", + " successful = sum(1 for r in runs if r.info.status == 'FINISHED')\n", + " durations = [r.data.metrics.get('duration_ms', 0) for r in runs if r.data.metrics.get('duration_ms')]\n", + " avg_duration = sum(durations) / len(durations) if durations else 0\n", + " \n", + " print(f\" Total Runs: {len(runs)}\")\n", + " print(f\" Successful: {successful}\")\n", + " print(f\" Failed: {len(runs) - successful}\")\n", + " print(f\" Avg Duration: {avg_duration:.1f} ms\" if avg_duration else \" Avg Duration: N/A\")\n", + "\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"MLflow verification complete\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/ray.ipynb b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/ray.ipynb new file mode 100644 index 000000000..885379c7e --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/ray.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e4a6ac7c-5c73-42a9-8b74-420788321543", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install ray==2.41.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "788f1517-251c-4171-af7d-f4c7a5073d71", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install numpy mlflow tensorflow==2.20.0 \"ray[serve,default,client]\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8deec5c-6239-4087-8a4d-27c091e9fc3c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import mlflow\n", + "import mlflow.tensorflow\n", + "import numpy as np\n", + "\n", + "from sklearn.datasets import load_diabetes\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "from tensorflow.keras import layers\n", + "\n", + "# -------------------\n", + "# Prepare Data\n", + "# -------------------\n", + "data = load_diabetes()\n", + "X = data.data\n", + "y = data.target.reshape(-1, 1)\n", + "\n", + "X_train, X_test, y_train, y_test = 
train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + ")\n", + "\n", + "# -------------------\n", + "# Define Model\n", + "# -------------------\n", + "def create_model(input_dim):\n", + " model = keras.Sequential([\n", + " layers.Dense(64, activation=\"relu\", input_shape=(input_dim,)),\n", + " layers.Dense(32, activation=\"relu\"),\n", + " layers.Dense(1) # regression output\n", + " ])\n", + " model.compile(optimizer=\"adam\", loss=\"mse\", metrics=[\"mse\"])\n", + " return model\n", + "\n", + "input_dim = X_train.shape[1]\n", + "epochs = 50\n", + "batch_size = 32\n", + "\n", + "mlflow.set_experiment(\"Diabetes_Prediction_TensorFlow\")\n", + "\n", + "with mlflow.start_run():\n", + " mlflow.log_param(\"epochs\", epochs)\n", + " mlflow.log_param(\"batch_size\", batch_size)\n", + " mlflow.log_param(\"optimizer\", \"adam\")\n", + " mlflow.log_param(\"loss_fn\", \"mse\")\n", + " mlflow.log_param(\"input_features\", input_dim)\n", + "\n", + " model = create_model(input_dim)\n", + "\n", + " # Train\n", + " history = model.fit(\n", + " X_train, y_train,\n", + " validation_data=(X_test, y_test),\n", + " epochs=epochs,\n", + " batch_size=batch_size,\n", + " verbose=0\n", + " )\n", + "\n", + " # Evaluation\n", + " loss, mse = model.evaluate(X_test, y_test, verbose=0)\n", + " rmse = np.sqrt(mse)\n", + "\n", + " mlflow.log_metric(\"mse\", mse)\n", + " mlflow.log_metric(\"rmse\", rmse)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "556ae0b2-6fa6-4271-9e7d-553cd7056aab", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import mlflow.tensorflow\n", + "import tensorflow as tf\n", + "from starlette.requests import Request\n", + "from typing import Dict\n", + "\n", + "from ray import serve\n", + "import ray\n", + "\n", + "\n", + "@serve.deployment(\n", + " ray_actor_options={\n", + " \"runtime_env\": {\n", + " \"pip\": [\"tensorflow\"]\n", + " },\n", + " }\n", + ")\n", + "class TensorFlowMLflowDeployment:\n", + " def __init__(self):\n", + " print(\"Loading model...\")\n", + " self.model = model\n", + " print(\"Model loaded successfully.\")\n", + "\n", + " async def __call__(self, input_data) -> Dict:\n", + " try:\n", + " if isinstance(input_data, Request):\n", + " data = await input_data.json()\n", + " else:\n", + " data = input_data\n", + " features = data.get(\"features\", None)\n", + " if features is None:\n", + " return {\"error\": \"Missing 'features' in request\"}\n", + " X = np.array(features).reshape(1, -1)\n", + "\n", + " # Make prediction with TensorFlow model\n", + " prediction = self.model.predict(X).flatten().tolist()\n", + "\n", + " return {\"prediction\": prediction}\n", + " except Exception as e:\n", + " return {\"error\": str(e)}\n", + "\n", + "\n", + "# Bind and deploy\n", + "app = TensorFlowMLflowDeployment.bind()\n", + "handle = serve.run(app, route_prefix=\"/predict\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e264af73-6634-412b-9cbc-86b79c18e775", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "json_data = {\"features\": [0.0380759, 0.0506801, 0.0616962, 0.0218724, -0.0442235, -0.0348208, -0.0434008, -0.00259226, 0.0199084, -0.0176461]}\n", + "response = handle.remote(json_data)\n", + "await response" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/requirements.txt b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/requirements.txt new file mode 100644 index 000000000..788936a73 --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/requirements.txt @@ -0,0 +1,11 @@ +transformers==4.57.1 +torch==2.9.0 +tensorflow==2.20.0 +huggingface_hub==0.36.0 +numpy==2.3.4 +ipywidgets==8.1.8 +mlflow==2.19.0 +ollama==0.6.0 +panel==1.8.2 +ray==2.41.0 +psutil==7.1.3 diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/welcome.ipynb b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/welcome.ipynb new file mode 100644 index 000000000..52ef576d0 --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/files/welcome.ipynb @@ -0,0 +1,127 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5af4f666", + "metadata": {}, + "source": [ + "### Welcome Notebook - Notebook used to verify basic jupyterhub functionality and inference" + ] + }, + { + "cell_type": "markdown", + "id": "c85b6901", + "metadata": {}, + "source": [ + "**Purpose**: This notebook demonstrates semantic similarity search using the Qwen3-Embedding-0.6B model. It shows how to:\n", + "\n", + "1. Generate embeddings for search queries and documents.\n", + "2. Use instructed queries (queries with task descriptions) to improve retrieval quality.\n", + "3. Calculate similarity scores between queries and documents.\n", + "4. Identify which documents are most relevant to which queries.\n", + "\n", + "Use Case: Testing embedding model functionality in your JupyterHub environment. The example compares two queries (\"What is the capital of China?\" and \"Explain gravity\") against two documents to find the best matches. High scores (like 0.76) indicate strong semantic similarity." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8048aa56-4549-4afa-b8b0-d111cc7020c3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.7645573019981384, 0.14142529666423798], [0.13549786806106567, 0.5999557375907898]]\n" + ] + } + ], + "source": [ + "# Requires transformers>=4.51.0\n", + "\n", + "import torch\n", + "import torch.nn.functional as F\n", + "\n", + "from torch import Tensor\n", + "from transformers import AutoTokenizer, AutoModel\n", + "\n", + "\n", + "def last_token_pool(last_hidden_states: Tensor,\n", + " attention_mask: Tensor) -> Tensor:\n", + " left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])\n", + " if left_padding:\n", + " return last_hidden_states[:, -1]\n", + " else:\n", + " sequence_lengths = attention_mask.sum(dim=1) - 1\n", + " batch_size = last_hidden_states.shape[0]\n", + " return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]\n", + "\n", + "\n", + "def get_detailed_instruct(task_description: str, query: str) -> str:\n", + " return f'Instruct: {task_description}\\nQuery:{query}'\n", + "\n", + "# Each query must come with a one-sentence instruction that describes the task\n", + "task = 'Given a web search query, retrieve relevant passages that answer the query'\n", + "\n", + "queries = [\n", + " get_detailed_instruct(task, 'What is the capital of China?'),\n", + " get_detailed_instruct(task, 'Explain gravity')\n", + "]\n", + "# No need to add instruction for retrieval documents\n", + "documents = [\n", + " \"The capital of China is Beijing.\",\n", + " \"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.\"\n", + "]\n", + "input_texts = queries + documents\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')\n", + "model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')\n", + "\n", + "# We recommend enabling flash_attention_2 for better acceleration and memory saving.\n", + "# model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B', attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16).cuda()\n", + "\n", + "max_length = 8192\n", + "\n", + "# Tokenize the input texts\n", + "batch_dict = tokenizer(\n", + " input_texts,\n", + " padding=True,\n", + " truncation=True,\n", + " max_length=max_length,\n", + " return_tensors=\"pt\",\n", + ")\n", + "batch_dict.to(model.device)\n", + "outputs = model(**batch_dict)\n", + "embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])\n", + "\n", + "# normalize embeddings\n", + "embeddings = F.normalize(embeddings, p=2, dim=1)\n", + "scores = (embeddings[:2] @ embeddings[2:].T)\n", + "print(scores.tolist())\n", + "# [[0.7645568251609802, 0.14142508804798126], [0.13549736142158508, 0.5999549627304077]]\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/networkpolicy.yaml 
b/ai/ai-starter-kit/helm-chart/ai-starter-kit/networkpolicy.yaml new file mode 100644 index 000000000..d985d3ba8 --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/networkpolicy.yaml @@ -0,0 +1,46 @@ +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-singleuser-egress-to-ray + namespace: default +spec: + podSelector: + matchLabels: + app: jupyterhub + component: singleuser-server + release: ai-starter-kit + policyTypes: ["Egress"] + egress: + - to: + - podSelector: + matchLabels: + ray.io/node-type: head + ports: + - protocol: TCP + port: 8265 + - protocol: TCP + port: 8000 + - protocol: TCP + port: 10001 +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-singleuser-egress-to-mlflow + namespace: default +spec: + podSelector: + matchLabels: + app: jupyterhub + component: singleuser-server + release: ai-starter-kit + policyTypes: ["Egress"] + egress: + - to: + - podSelector: + matchLabels: + app.kubernetes.io/name: mlflow + app.kubernetes.io/instance: ai-starter-kit + ports: + - protocol: TCP + port: 5000 \ No newline at end of file diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/NOTES.txt b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/NOTES.txt new file mode 100644 index 000000000..4e33a20ed --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/NOTES.txt @@ -0,0 +1 @@ +AI Starter Kit installed. Enjoy \ No newline at end of file diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/_helpers.tpl b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/_helpers.tpl new file mode 100644 index 000000000..cf0c5e081 --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "ai-starter-kit.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "ai-starter-kit.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "ai-starter-kit.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "ai-starter-kit.labels" -}} +helm.sh/chart: {{ include "ai-starter-kit.chart" . }} +{{ include "ai-starter-kit.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "ai-starter-kit.selectorLabels" -}} +app.kubernetes.io/name: {{ include "ai-starter-kit.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "ai-starter-kit.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "ai-starter-kit.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/configmaps.yaml b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/configmaps.yaml new file mode 100644 index 000000000..e03429ee9 --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/configmaps.yaml @@ -0,0 +1,18 @@ +--- +{{- /* +Create a single ConfigMap with all initialization files for the jupyterhub singleuser pod. +This ConfigMap is mounted as a volume. +*/ -}} +apiVersion: v1 +kind: ConfigMap +metadata: + name: ai-starter-kit-init-files + labels: + app.kubernetes.io/managed-by: {{ $.Release.Service | quote }} + app.kubernetes.io/instance: {{ $.Release.Name | quote }} + helm.sh/chart: "{{ $.Chart.Name }}-{{ $.Chart.Version }}" +data: +{{- range $path, $bytes := .Files.Glob "files/*" }} + {{ base $path | quote }}: |- +{{ $bytes | toString | nindent 4 }} +{{- end }} diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/delete-pods-job.yaml b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/delete-pods-job.yaml new file mode 100644 index 000000000..d25283a4b --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/delete-pods-job.yaml @@ -0,0 +1,25 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: "{{ include "ai-starter-kit.fullname" . }}-delete-jupyter-admin-pod" + labels: + {{- include "ai-starter-kit.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-delete-policy": hook-succeeded +spec: + template: + metadata: + name: "{{ include "ai-starter-kit.fullname" . }}-delete-jupyter-admin-pod" + labels: + {{- include "ai-starter-kit.labels" . | nindent 8 }} + spec: + serviceAccountName: {{ include "ai-starter-kit.fullname" . }}-jupyterhub-hub + restartPolicy: OnFailure + containers: + - name: delete-jupyter-admin-pod + image: alpine/kubectl:1.33.4 + command: ["/bin/sh", "-c"] + args: + - | + kubectl delete pod -l app.kubernetes.io/component=singleuser-server -n {{ .Release.Namespace }} diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/generic-device-plugin.yaml b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/generic-device-plugin.yaml new file mode 100644 index 000000000..3e387f5ce --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/generic-device-plugin.yaml @@ -0,0 +1,65 @@ +{{- if .Values.genericDevicePlugin.enabled }} +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "ai-starter-kit.fullname" . }}-generic-device-plugin + namespace: kube-system + labels: + {{- include "ai-starter-kit.labels" . | nindent 4 }} + app.kubernetes.io/component: generic-device-plugin +spec: + selector: + matchLabels: + {{- include "ai-starter-kit.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: generic-device-plugin + template: + metadata: + labels: + {{- include "ai-starter-kit.selectorLabels" . 
| nindent 8 }} + app.kubernetes.io/component: generic-device-plugin + spec: + priorityClassName: system-node-critical + tolerations: + - operator: "Exists" + effect: "NoExecute" + - operator: "Exists" + effect: "NoSchedule" + containers: + - image: {{ .Values.genericDevicePlugin.image.repository | default "squat/generic-device-plugin" }}:{{ .Values.genericDevicePlugin.image.tag | default "latest" }} + imagePullPolicy: {{ .Values.genericDevicePlugin.image.pullPolicy | default "IfNotPresent" }} + name: generic-device-plugin + args: + - --device + - | + name: dri + groups: + - count: {{ .Values.genericDevicePlugin.device.count | default 4 }} + paths: + - path: /dev/dri + resources: + requests: + cpu: {{ .Values.genericDevicePlugin.resources.requests.cpu | default "50m" }} + memory: {{ .Values.genericDevicePlugin.resources.requests.memory | default "10Mi" }} + limits: + cpu: {{ .Values.genericDevicePlugin.resources.limits.cpu | default "50m" }} + memory: {{ .Values.genericDevicePlugin.resources.limits.memory | default "20Mi" }} + ports: + - containerPort: 8080 + name: http + securityContext: + privileged: true + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: dev + mountPath: /dev + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins + - name: dev + hostPath: + path: /dev + updateStrategy: + type: RollingUpdate +{{- end }} \ No newline at end of file diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/hf-secret.yaml b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/hf-secret.yaml new file mode 100644 index 000000000..308b0a94a --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/hf-secret.yaml @@ -0,0 +1,13 @@ +{{- if .Values.huggingface.token }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Release.Name }}-hf-token-secret + labels: + app.kubernetes.io/managed-by: {{ .Release.Service | quote }} + app.kubernetes.io/instance: {{ .Release.Name | quote }} + helm.sh/chart: "{{ .Chart.Name }}-{{ .Chart.Version }}" +type: Opaque +stringData: + token: {{ .Values.huggingface.token }} +{{- end }} \ No newline at end of file diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/local-pv.yaml b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/local-pv.yaml new file mode 100644 index 000000000..0797b93e3 --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/local-pv.yaml @@ -0,0 +1,16 @@ +{{- if .Values.localPersistence.enabled }} +apiVersion: v1 +kind: PersistentVolume +metadata: + name: {{ .Release.Name }}-models-cache-pv + labels: + type: local +spec: + storageClassName: manual + capacity: + storage: {{ .Values.modelsCachePvc.size }} + accessModes: + - ReadWriteOnce + hostPath: + path: "{{ .Values.localPersistence.hostPath }}" +{{- end }} diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/pvc-mc-only.yaml b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/pvc-mc-only.yaml new file mode 100644 index 000000000..99179f9f1 --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/pvc-mc-only.yaml @@ -0,0 +1,28 @@ +{{- if .Values.modelsCacheOnlyPvc.enabled -}} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Release.Name }}-models-cache-only-pvc + labels: + app.kubernetes.io/managed-by: {{ .Release.Service | quote }} + app.kubernetes.io/instance: {{ .Release.Name | quote }} + helm.sh/chart: "{{ .Chart.Name }}-{{ .Chart.Version }}" +spec: + accessModes: + {{- toYaml 
.Values.modelsCacheOnlyPvc.accessModes | nindent 4 }} + {{- if .Values.localPersistence.enabled }} + storageClassName: manual + {{- else }} + {{- /* + If storageClassName is set to a specific class, it will be used. + If storageClassName is set to an empty string (""), no storage class will be used for provisioning. + If storageClassName is null or omitted, the default storage class will be used. + */}} + {{- if or .Values.modelsCacheOnlyPvc.storageClassName (eq .Values.modelsCacheOnlyPvc.storageClassName "") }} + storageClassName: {{ .Values.modelsCacheOnlyPvc.storageClassName | quote }} + {{- end }} + {{- end }} + resources: + requests: + storage: {{ .Values.modelsCacheOnlyPvc.size }} +{{- end -}} diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/pvc-ray.yaml b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/pvc-ray.yaml new file mode 100644 index 000000000..cb4ae5b1d --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/pvc-ray.yaml @@ -0,0 +1,28 @@ +{{- if .Values.rayPvc.enabled -}} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Release.Name }}-ray-pvc + labels: + app.kubernetes.io/managed-by: {{ .Release.Service | quote }} + app.kubernetes.io/instance: {{ .Release.Name | quote }} + helm.sh/chart: "{{ .Chart.Name }}-{{ .Chart.Version }}" +spec: + accessModes: + {{- toYaml .Values.rayPvc.accessModes | nindent 4 }} + {{- if .Values.localPersistence.enabled }} + storageClassName: manual + {{- else }} + {{- /* + If storageClassName is set to a specific class, it will be used. + If storageClassName is set to an empty string (""), no storage class will be used for provisioning. + If storageClassName is null or omitted, the default storage class will be used. + */}} + {{- if or .Values.rayPvc.storageClassName (eq .Values.rayPvc.storageClassName "") }} + storageClassName: {{ .Values.rayPvc.storageClassName | quote }} + {{- end }} + {{- end }} + resources: + requests: + storage: {{ .Values.rayPvc.size }} +{{- end -}} diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/pvc.yaml b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/pvc.yaml new file mode 100644 index 000000000..36ba98fdc --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/pvc.yaml @@ -0,0 +1,28 @@ +{{- if .Values.modelsCachePvc.enabled -}} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Release.Name }}-models-cache-pvc + labels: + app.kubernetes.io/managed-by: {{ .Release.Service | quote }} + app.kubernetes.io/instance: {{ .Release.Name | quote }} + helm.sh/chart: "{{ .Chart.Name }}-{{ .Chart.Version }}" +spec: + accessModes: + {{- toYaml .Values.modelsCachePvc.accessModes | nindent 4 }} + {{- if .Values.localPersistence.enabled }} + storageClassName: manual + {{- else }} + {{- /* + If storageClassName is set to a specific class, it will be used. + If storageClassName is set to an empty string (""), no storage class will be used for provisioning. + If storageClassName is null or omitted, the default storage class will be used. 
+ */}} + {{- if or .Values.modelsCachePvc.storageClassName (eq .Values.modelsCachePvc.storageClassName "") }} + storageClassName: {{ .Values.modelsCachePvc.storageClassName | quote }} + {{- end }} + {{- end }} + resources: + requests: + storage: {{ .Values.modelsCachePvc.size }} +{{- end -}} \ No newline at end of file diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/ramalama-deployment.yaml b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/ramalama-deployment.yaml new file mode 100644 index 000000000..0accdfebe --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/templates/ramalama-deployment.yaml @@ -0,0 +1,89 @@ +{{- if .Values.ramalama.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "ai-starter-kit.fullname" . }}-ramalama + labels: + {{- include "ai-starter-kit.labels" . | nindent 4 }} + app.kubernetes.io/component: ramalama +spec: + replicas: 1 + selector: + matchLabels: + {{- include "ai-starter-kit.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: ramalama + template: + metadata: + labels: + {{- include "ai-starter-kit.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: ramalama + spec: + {{- if .Values.ramalama.nodeSelector }} + nodeSelector: + {{- toYaml .Values.ramalama.nodeSelector | nindent 8 }} + {{- end }} + containers: + - name: ramalama + image: "{{ .Values.ramalama.image.repository }}:{{ .Values.ramalama.image.tag }}" + imagePullPolicy: {{ .Values.ramalama.image.pullPolicy }} + command: + - ramalama + - --store + - "/mnt/ramalama-store" + - serve + - {{ .Values.ramalama.models.serve }} + - "--port" + - "8080" + {{- if .Values.ramalama.persistentVolume.enabled }} + volumeMounts: + - name: store + mountPath: "/mnt/ramalama-store" + subPath: {{ .Values.ramalama.persistentVolume.subPath }} + {{- end }} + ports: + - containerPort: 8080 + protocol: TCP + {{- if .Values.ramalama.resources }} + resources: + {{- toYaml .Values.ramalama.resources | nindent 10 }} + {{- end }} + initContainers: + - name: init + image: "{{ .Values.ramalama.image.repository }}:{{ .Values.ramalama.image.tag }}" + imagePullPolicy: {{ .Values.ramalama.image.pullPolicy }} + command: + - "bash" + - "-c" + - | + ramalama --store /mnt/ramalama-store pull {{ .Values.ramalama.models.serve }} + {{- if .Values.ramalama.persistentVolume.enabled }} + volumeMounts: + - name: store + mountPath: "/mnt/ramalama-store" + subPath: {{ .Values.ramalama.persistentVolume.subPath }} + {{- end }} + {{- if .Values.ramalama.persistentVolume.enabled }} + volumes: + - name: store + persistentVolumeClaim: + claimName: {{ .Values.ramalama.persistentVolume.existingClaim }} + {{- end }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ include "ai-starter-kit.fullname" . }}-ramalama + labels: + {{- include "ai-starter-kit.labels" . | nindent 4 }} + app.kubernetes.io/component: ramalama +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: 8080 + protocol: TCP + name: http + selector: + {{- include "ai-starter-kit.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: ramalama +{{- end }} diff --git a/ai/ai-starter-kit/helm-chart/ai-starter-kit/values.yaml b/ai/ai-starter-kit/helm-chart/ai-starter-kit/values.yaml new file mode 100644 index 000000000..ed0e98c59 --- /dev/null +++ b/ai/ai-starter-kit/helm-chart/ai-starter-kit/values.yaml @@ -0,0 +1,200 @@ +jupyterhub: + nameOverride: "jupyterhub" + # This value has to be null in order to apply release name on this chart's resources. 
+ # https://github.com/jupyterhub/zero-to-jupyterhub-k8s/blob/b4b51301ac886511c643cc5d428b15ff38006bee/jupyterhub/values.yaml#L1 + fullnameOverride: + + singleuser: + networkPolicy: + enabled: false + defaultUrl: "/lab/tree/welcome.ipynb" + image: + name: jupyterhub/k8s-singleuser-sample + tag: "4.2.0" + initContainers: + # This init container makes sure that the home folder we mount has the correct owner + - name: chown-home-mount-dir + image: jupyterhub/k8s-singleuser-sample:4.2.0 + securityContext: + runAsUser: 0 + command: ["chown", "jovyan", "/home/jovyan"] + volumeMounts: + - name: home + mountPath: /home/jovyan + subPath: jupyterhub_workspace + + - name: model-initializer + image: jupyterhub/k8s-singleuser-sample:4.2.0 + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: ai-starter-kit-hf-token-secret + key: token + command: + - /bin/sh + - -c + - | + set -e + pip install -r /tmp/requirements.txt + + python /tmp/download_models.py + + # populate workspace with initial notebook files + for f in /tmp/*.ipynb; do + if [ -f "$f" ]; then + # Use cp -n to not overwrite existing files. + cp -n "$f" /home/jovyan/ + fi + done + volumeMounts: + # This 'home' volume is created by the helm chart's 'homeMountPath' option. + # We mount it in the init containers too, so all downloads and installations are persisted in this mounted home folder. + - name: home + mountPath: /home/jovyan + subPath: jupyterhub_workspace + - name: init-files + mountPath: /tmp + readOnly: true + + storage: + type: static + static: + pvcName: "ai-starter-kit-models-cache-pvc" + subPath: "jupyterhub_workspace" + capacity: 20Gi + homeMountPath: /home/jovyan + extraVolumes: + - name: init-files + configMap: + name: "ai-starter-kit-init-files" + # This environment variable list has its own format: https://z2jh.jupyter.org/en/latest/resources/reference.html#singleuser-extraenv + extraEnv: + HF_TOKEN: + name: HF_TOKEN + valueFrom: + secretKeyRef: + name: ai-starter-kit-hf-token-secret + key: token + RAY_ADDRESS: "ray://ai-starter-kit-kuberay-head-svc:10001" + MLFLOW_TRACKING_URI: "http://ai-starter-kit-mlflow:5000" + hub: + networkPolicy: + enabled: false + db: + type: sqlite-pvc + pvc: + annotations: + # Without this annotation, Helm will not keep the PVC after uninstallation + # https://github.com/jupyterhub/zero-to-jupyterhub-k8s/issues/3718 + helm.sh/resource-policy: keep + extraConfig: + 00-dummy-authenticator: | + c.DummyAuthenticator.password = "password" + 01-spawner-timeouts: | + c.KubeSpawner.start_timeout = 1800 + proxy: + chp: + networkPolicy: + enabled: false + traefik: + networkPolicy: + enabled: false + +ray-cluster: + enabled: false + image: + tag: "2.41.0-py312-cpu" + head: + serviceType: ClusterIP + resources: + requests: + cpu: "1" + memory: "2G" + ephemeral-storage: 10Gi + limits: + cpu: "4" + memory: "8G" + ephemeral-storage: 10Gi + worker: + resources: + requests: + cpu: "1" + memory: "2G" + ephemeral-storage: 10Gi + limits: + cpu: "4" + memory: "8G" + ephemeral-storage: 10Gi + +mlflow: + resources: + requests: + cpu: 100m + memory: 512Mi + limits: + cpu: 1000m + memory: 1Gi + +huggingface: + # Provide your Hugging Face token here to download gated or private models. + # It is recommended to set this via --set or a separate values file, e.g., + # --set huggingface.token=hf_... + token: "" + +rayPvc: + enabled: false + storageClassName: "standard-rwo" + accessModes: + - ReadWriteOnce + size: 100Gi + +modelsCachePvc: + enabled: true + # To use the default StorageClass, set storageClassName to null or omit it.
+ # To use a specific StorageClass (e.g. "standard-rwo" on GKE), provide its name. + # To create a PVC that doesn't request any StorageClass, set it to an empty string (""). + storageClassName: "standard-rwo" + accessModes: + - ReadWriteOnce + size: 10Gi + +modelsCacheOnlyPvc: + enabled: false + +localPersistence: + # For local development with minikube, this allows persisting the models-cache + # on the host machine, surviving `minikube stop/start`. + # 1. Create a directory on your host: `mkdir -p $HOME/models-cache` + # 2. Start minikube with the mount: `minikube start --mount --mount-string="$HOME/models-cache:/tmp/models-cache"` + # 3. Set enabled to true below, or via `--set localPersistence.enabled=true` + enabled: true + # This path must match the mount destination inside the minikube node (the part after the colon in --mount-string above). + hostPath: "/tmp/models-cache" + +ollama: + enabled: true + ollama: + models: + pull: + - gemma3 + persistentVolume: + enabled: true + existingClaim: "ai-starter-kit-models-cache-pvc" + subPath: "ollama" + +ramalama: + enabled: true + image: + repository: "quay.io/ramalama/ramalama" + tag: "latest" + pullPolicy: IfNotPresent + persistentVolume: + enabled: true + existingClaim: "ai-starter-kit-models-cache-pvc" + subPath: "ramalama" + models: + serve: qwen2.5:1.5b + +genericDevicePlugin: + enabled: false
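A quick post-install smoke test of the serving and tracking pieces added above might look like the sketch below. It assumes the chart is installed as the `ai-starter-kit` release (so the service names `ai-starter-kit-ramalama` and `ai-starter-kit-mlflow` from the templates and values apply) and that the ramalama container, being a llama.cpp-based server, exposes the usual OpenAI-style `/v1/models` route; adjust names and routes if your setup differs.

```bash
# Smoke-test sketch: service names assume a release named "ai-starter-kit".
kubectl wait --for=condition=Ready pods -l app.kubernetes.io/component=ramalama --timeout=1800s

# ramalama serves on port 8080 (see ramalama-deployment.yaml above).
kubectl port-forward service/ai-starter-kit-ramalama 8080:8080 &
RAMALAMA_PID=$!

# MLflow listens on port 5000, matching MLFLOW_TRACKING_URI in values.yaml.
kubectl port-forward service/ai-starter-kit-mlflow 5000:5000 &
MLFLOW_PID=$!

sleep 5
curl -sf http://127.0.0.1:8080/v1/models   # assumed OpenAI-compatible route on the llama.cpp server
curl -sf http://127.0.0.1:5000/health      # MLflow tracking server health endpoint

kill $RAMALAMA_PID $MLFLOW_PID
```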