diff --git a/README.md b/README.md
index 5cd1643..6b77afc 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,142 @@
-# jetson-ollama
+# Jetson-ollama
+
+Simple hosting of ollama and additional value-add services on a Jetson Orin.
+
+## Install
+
+If you are using a JetPack version that does not yet have pre-built images, you will want to install https://github.com/dusty-nv/jetson-containers and use its build commands to produce your own docker containers.
+
+This repo simply hosts a docker-compose file and a systemd service file that ensures the docker-compose services are started on boot.
+
+```
+sudo mkdir -p /opt/ollama
+sudo cp docker-compose-ollama.yaml /opt/ollama/docker-compose-ollama.yaml
+```
+
+```
+sudo cp ollama.service /etc/systemd/system/ollama.service
+sudo systemctl enable ollama.service
+sudo systemctl start ollama.service
+```
+
+## Hosting a Model
+
+You will now need to pull and then run a model:
+
+```
+curl http://localhost:11434/api/pull -d '{
+  "model": "mistral-nemo"
+}'
+```
+
+Output:
+```
+{"status":"pulling manifest"}
+{"status":"pulling b559938ab7a0","digest":"sha256:b559938ab7a0392fc9ea9675b82280f2a15669ec3e0e0fc491c9cb0a7681cf94","total":7071700672,"completed":7071700672}
+{"status":"pulling f023d1ce0e55","digest":"sha256:f023d1ce0e55d0dcdeaf70ad81555c2a20822ed607a7abd8de3c3131360f5f0a","total":688,"completed":688}
+{"status":"pulling 43070e2d4e53","digest":"sha256:43070e2d4e532684de521b885f385d0841030efa2b1a20bafb76133a5e1379c1","total":11356,"completed":11356}
+{"status":"pulling ed11eda7790d","digest":"sha256:ed11eda7790d05b49395598a42b155812b17e263214292f7b87d15e14003d337","total":30,"completed":30}
+{"status":"pulling 65d37de20e59","digest":"sha256:65d37de20e5951c7434ad4230c51a4d5be99b8cb7407d2135074d82c40b44b45","total":486,"completed":486}
+{"status":"verifying sha256 digest"}
+{"status":"writing manifest"}
+{"status":"success"}
+```
+
+We can validate that we have at least one model available by checking the tags:
+
+```
+curl http://0.0.0.0:11434/api/tags
+```
+
+Output:
+```
+{"models":[{"name":"mistral-nemo:latest","model":"mistral-nemo:latest","modified_at":"2024-12-25T00:01:25.24932255Z","size":7071713232,"digest":"994f3b8b78011aa6d578b0c889cbb89a64b778f80d73b8d991a8db1f1e710ace","details":{"parent_model":"","format":"gguf","family":"llama","families":["llama"],"parameter_size":"12.2B","quantization_level":"Q4_0"}}]}
+```
+
+We can check whether any models are currently running:
+
+```
+curl http://0.0.0.0:11434/api/ps
+```
+
+Output:
+```
+{"models":[]}
+```
+
+We can load a model into memory by submitting an empty generate request:
+
+```
+curl http://0.0.0.0:11434/api/generate -d '{
+  "model": "mistral-nemo"
+}'
+```
+
+Output:
+```
+{"model":"mistral-nemo","created_at":"2024-12-25T00:13:16.691913415Z","response":"","done":true,"done_reason":"load"}
+```
+
+```
+curl http://0.0.0.0:11434/api/ps
+```
+
+Output:
+```
+{"models":[{"name":"mistral-nemo:latest","model":"mistral-nemo:latest","size":9290250240,"digest":"994f3b8b78011aa6d578b0c889cbb89a64b778f80d73b8d991a8db1f1e710ace","details":{"parent_model":"","format":"gguf","family":"llama","families":["llama"],"parameter_size":"12.2B","quantization_level":"Q4_0"},"expires_at":"2024-12-25T00:18:16.692451257Z","size_vram":9290250240}]}
+```
+
+The `expires_at` field shows that the model will stay in memory for five minutes (the default keep-alive), after which it is unloaded:
+
+```
+date -u
+Wed Dec 25 00:15:21 UTC 2024
+```
+
+If we want to keep the model in memory indefinitely, we can use the `keep_alive` parameter. A value of `-1` sets an infinite expiry, a value of `0` unloads the model from memory immediately, and other values take a duration such as `5m`, `1h`, or `3d`.
+
+```
+curl http://0.0.0.0:11434/api/generate -d '{
+  "model": "mistral-nemo",
+  "keep_alive": -1
+}'
+```
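+
+Similarly, here is a sketch of the same call with `keep_alive` set to `0`, which should unload the model immediately rather than waiting for the timeout (afterwards `/api/ps` should report an empty model list again):
+
+```
+curl http://0.0.0.0:11434/api/generate -d '{
+  "model": "mistral-nemo",
+  "keep_alive": 0
+}'
+```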
+
+Lastly, we can submit a chat request:
+
+```
+curl http://0.0.0.0:11434/api/chat -d '{
+  "model": "mistral-nemo",
+  "messages": [
+    {
+      "role": "user",
+      "content": "Why is the sky blue?"
+    }
+  ]
+}'
+```
+
+Output:
+```
+{"model":"mistral-nemo","created_at":"2024-12-25T00:26:08.203017147Z","message":{"role":"assistant","content":"The"},"done":false}
+{"model":"mistral-nemo","created_at":"2024-12-25T00:26:08.38823807Z","message":{"role":"assistant","content":" sky"},"done":false}
+{"model":"mistral-nemo","created_at":"2024-12-25T00:26:08.546712356Z","message":{"role":"assistant","content":" appears"},"done":false}
+{"model":"mistral-nemo","created_at":"2024-12-25T00:26:08.705009365Z","message":{"role":"assistant","content":" blue"},"done":false}
+{"model":"mistral-nemo","created_at":"2024-12-25T00:26:08.858441986Z","message":{"role":"assistant","content":" due"},"done":false}
+{"model":"mistral-nemo","created_at":"2024-12-25T00:26:09.011519221Z","message":{"role":"assistant","content":" to"},"done":false}
+{"model":"mistral-nemo","created_at":"2024-12-25T00:26:09.164679014Z","message":{"role":"assistant","content":" a"},"done":false}
+{"model":"mistral-nemo","created_at":"2024-12-25T00:26:09.317643996Z","message":{"role":"assistant","content":" phenomenon"},"done":false}
+{"model":"mistral-nemo","created_at":"2024-12-25T00:26:09.470739343Z","message":{"role":"assistant","content":" called"},"done":false}
+{"model":"mistral-nemo","created_at":"2024-12-25T00:26:09.623952448Z","message":{"role":"assistant","content":" Ray"},"done":false}
+{"model":"mistral-nemo","created_at":"2024-12-25T00:26:09.777503755Z","message":{"role":"assistant","content":"leigh"},"done":false}
+{"model":"mistral-nemo","created_at":"2024-12-25T00:26:09.929986217Z","message":{"role":"assistant","content":" scattering"},"done":false}
+{"model":"mistral-nemo","created_at":"2024-12-25T00:26:10.083242522Z","message":{"role":"assistant","content":"."},"done":false}
+...
+```
+
+The stream continues until the model generates its stop token. Naturally, this API is cumbersome for general use, so we also include [Open WebUI](https://github.com/open-webui/open-webui), which is hosted on port 11433.
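+
+The chat endpoint also accepts a `stream` parameter; setting it to `false` returns a single JSON object containing the complete answer instead of the token-by-token stream shown above. A minimal sketch of the same request (output not shown here):
+
+```
+curl http://0.0.0.0:11434/api/chat -d '{
+  "model": "mistral-nemo",
+  "stream": false,
+  "messages": [
+    {
+      "role": "user",
+      "content": "Why is the sky blue?"
+    }
+  ]
+}'
+```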
+
+
-Simple hosting of ollama and additional value add services on a Jetson Orin
\ No newline at end of file
diff --git a/docker-compose-ollama.yaml b/docker-compose-ollama.yaml
new file mode 100644
index 0000000..d2f9adb
--- /dev/null
+++ b/docker-compose-ollama.yaml
@@ -0,0 +1,45 @@
+version: '3'
+
+services:
+  ollama:
+    image: dustynv/ollama:r36.4.0
+    runtime: nvidia
+    command: /bin/ollama serve
+    ports:
+      - 11434:11434
+    environment:
+      - OLLAMA_MODEL=mistral-nemo
+      # - OLLAMA_KEEP_ALIVE=24h
+      - OLLAMA_MODELS=/ollama
+      - OLLAMA_LOGS=/ollama/ollama.log
+    volumes:
+      - ~/opt/ollama:/ollama
+    restart: unless-stopped
+    networks:
+      - ollama-docker
+
+  ollama-webui:
+    image: ghcr.io/open-webui/open-webui:main
+    container_name: ollama-webui
+    volumes:
+      - ./opt/ollama/:/app/backend/data
+    depends_on:
+      - ollama
+    ports:
+      - 11433:8080
+    environment: # https://docs.openwebui.com/getting-started/advanced-topics/env-configuration
+      - OLLAMA_BASE_URL=http://ollama:11434 # comma-separated ollama hosts
+      - OLLAMA_HOSTNAME=${OLLAMA_HOSTNAME}
+      - ENABLE_OPENAI_API=False
+      - ENV=dev
+      - WEBUI_AUTH=False
+      - WEBUI_NAME=Runcible AI
+      - WEBUI_URL=http://${OLLAMA_HOSTNAME}:11433
+      - WEBUI_SECRET_KEY=t0p-s3cr3t
+    restart: unless-stopped
+    networks:
+      - ollama-docker
+
+networks:
+  ollama-docker:
+    external: false
diff --git a/ollama.service b/ollama.service
new file mode 100644
index 0000000..f22bad5
--- /dev/null
+++ b/ollama.service
@@ -0,0 +1,16 @@
+[Unit]
+Description=Ollama LLM Service
+Requires=docker.service
+After=docker.service
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+WorkingDirectory=/opt/ollama
+ExecStartPre=/bin/sh -c 'echo OLLAMA_HOSTNAME=$(hostname) > /opt/ollama/.env'
+ExecStart=docker compose --env-file /opt/ollama/.env -f /opt/ollama/docker-compose-ollama.yaml up -d
+ExecStop=docker compose -f /opt/ollama/docker-compose-ollama.yaml down
+
+[Install]
+WantedBy=multi-user.target
+