Skip to content

Commit a4e6a9b

Browse files
committed
build: compose and install script updates for disk-collector sidecar
1 parent 5113cc3 commit a4e6a9b

File tree

5 files changed

+296
-53
lines changed

5 files changed

+296
-53
lines changed

install/install_nomad.sh

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ START_SCRIPT_URL="https://raw.githubusercontent.com/Crosstalk-Solutions/project-
3838
STOP_SCRIPT_URL="https://raw.githubusercontent.com/Crosstalk-Solutions/project-nomad/refs/heads/main/install/stop_nomad.sh"
3939
UPDATE_SCRIPT_URL="https://raw.githubusercontent.com/Crosstalk-Solutions/project-nomad/refs/heads/main/install/update_nomad.sh"
4040
WAIT_FOR_IT_SCRIPT_URL="https://raw.githubusercontent.com/vishnubob/wait-for-it/master/wait-for-it.sh"
41-
COLLECT_DISK_INFO_SCRIPT_URL="https://raw.githubusercontent.com/Crosstalk-Solutions/project-nomad/refs/heads/main/install/collect_disk_info.sh"
4241

4342
script_option_debug='true'
4443
accepted_terms='false'
@@ -381,12 +380,6 @@ create_nomad_directory(){
381380
sudo touch "${NOMAD_DIR}/storage/logs/admin.log"
382381
}
383382

384-
create_disk_info_file() {
385-
# Disk info file MUST be created before the admin container starts.
386-
# Otherwise, Docker will assume we meant to mount a directory and will create an empty directory at the mount point
387-
echo '{}' > /tmp/nomad-disk-info.json
388-
}
389-
390383
download_management_compose_file() {
391384
local compose_file_path="${NOMAD_DIR}/compose.yml"
392385

@@ -463,24 +456,6 @@ download_sidecar_files() {
463456
echo -e "${GREEN}#${RESET} Sidecar updater script downloaded successfully to $sidecar_script_path.\\n"
464457
}
465458

466-
download_and_start_collect_disk_info_script() {
467-
local collect_disk_info_script_path="${NOMAD_DIR}/collect_disk_info.sh"
468-
469-
echo -e "${YELLOW}#${RESET} Downloading collect_disk_info script...\\n"
470-
if ! curl -fsSL "$COLLECT_DISK_INFO_SCRIPT_URL" -o "$collect_disk_info_script_path"; then
471-
echo -e "${RED}#${RESET} Failed to download the collect_disk_info script. Please check the URL and try again."
472-
exit 1
473-
fi
474-
chmod +x "$collect_disk_info_script_path"
475-
echo -e "${GREEN}#${RESET} collect_disk_info script downloaded successfully to $collect_disk_info_script_path.\\n"
476-
477-
# Start script in background and store PID for easy removal on uninstall
478-
echo -e "${YELLOW}#${RESET} Starting collect_disk_info script in the background...\\n"
479-
nohup bash "$collect_disk_info_script_path" > /dev/null 2>&1 &
480-
echo $! > "${NOMAD_DIR}/nomad-collect-disk-info.pid"
481-
echo -e "${GREEN}#${RESET} collect_disk_info script started successfully.\\n"
482-
}
483-
484459
download_helper_scripts() {
485460
local start_script_path="${NOMAD_DIR}/start_nomad.sh"
486461
local stop_script_path="${NOMAD_DIR}/stop_nomad.sh"
@@ -609,7 +584,6 @@ download_wait_for_it_script
609584
download_entrypoint_script
610585
download_sidecar_files
611586
download_helper_scripts
612-
download_and_start_collect_disk_info_script
613587
download_management_compose_file
614588
start_management_containers
615589
verify_gpu_setup

install/management_compose.yaml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ services:
1111
- "8080:8080"
1212
volumes:
1313
- /opt/project-nomad/storage:/app/storage
14-
- /tmp/nomad-disk-info.json:/app/storage/nomad-disk-info.json
1514
- /var/run/docker.sock:/var/run/docker.sock # Allows the admin service to communicate with the Host's Docker daemon
1615
- ./entrypoint.sh:/usr/local/bin/entrypoint.sh
1716
- ./wait-for-it.sh:/usr/local/bin/wait-for-it.sh
@@ -95,6 +94,14 @@ services:
9594
- /var/run/docker.sock:/var/run/docker.sock # Allows communication with the Host's Docker daemon
9695
- /opt/project-nomad:/opt/project-nomad # Writable access required so the updater can set the correct image tag in compose.yml
9796
- nomad-update-shared:/shared # Shared volume for communication with admin container
97+
disk-collector:
98+
image: ghcr.io/crosstalk-solutions/project-nomad-disk-collector:latest
99+
pull_policy: always
100+
container_name: nomad_disk_collector
101+
restart: unless-stopped
102+
volumes:
103+
- /:/host:ro,rslave # Read-only view of host FS with rslave propagation so /sys and /proc submounts are visible
104+
- /opt/project-nomad/storage:/storage
98105

99106
volumes:
100107
nomad-update-shared:

install/migrate-disk-collector.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Project N.O.M.A.D. — About the Disk Collector Migration Script
2+
3+
This script migrates your Project N.O.M.A.D. installation from the old host-based disk info collector to the new disk-collector sidecar. It modifies `/opt/project-nomad/compose.yml` to add the new service and remove the old bind mount, then restarts the full compose stack to apply the changes.
4+
5+
### Why the Migration?
6+
The new disk-collector sidecar provides a more robust and scalable way to collect disk information from the host. It removes the original bind mount to `/tmp/nomad-disk-info.json`, which was fragile and prone to issues on host reboots.
7+
8+
The original host-based collector relied on a process running on the host that wrote disk info to a file, which was then read by the admin container via a bind mount. This approach had several drawbacks:
9+
- The host process could fail or be killed, leading to stale or missing disk info.
10+
- The bind-mount source file `/tmp/nomad-disk-info.json` was deleted on host reboots (because `/tmp` is cleared), causing Docker to create a directory at that path instead of a file.
11+
- The approach required tighter coupling to the host, which would make more flexible deployment options harder to achieve in the future.
12+
13+
The migration script automates the necessary changes to your compose configuration and ensures a smooth transition to the new architecture.
14+
15+
### Why does Nomad need the nomad-disk-info.json file?
16+
Nomad uses the disk info stored and updated in `nomad-disk-info.json` to allow users to view disk usage and availability within the Nomad "Command Center". While not critical to the core functionality of Nomad, it provides a more pleasant experience for users with limited storage space and/or who aren't familiar with command-line tools and Linux management.
17+
18+
### Why a separate container?
19+
The disk-collector runs in a separate container to isolate its functionality from the main admin container. This separation provides several benefits:
20+
- **Stability**: If the disk-collector encounters an issue or crashes, it won't affect the main admin container and vice versa.
21+
- **Security**: The main admin container already has significant host access via the Docker socket, storage directory, and host.docker.internal. Additionally, Nomad may add more features in the future that support multi-user environments and/or more network exposure, so isolating the disk-collector reduces the exposure of the host filesystem (even if read-only) to just the one container, which has a very limited scope of functionality and access.
22+
- **Modularity**: Because having the host disk info is not a critical component of Nomad's core functionality, isolating it in a sidecar allows users who don't need/want the disk info features to simply not run that container, without impacting the main admin container or other services. It also allows for more flexible future development of the disk-collector without needing to modify the main admin container.
23+
24+
### What if I don't want to run the migration script?
25+
No worries - you can replicate the changes manually by editing your `/opt/project-nomad/compose.yml` to add the new disk-collector service and remove the old bind mount from the admin service, then restarting your compose stack. The migration script just automates these steps and ensures they're done correctly, but the underlying changes are straightforward if you prefer to do it yourself. Just be sure to back up your `compose.yml` before making any changes.
26+
27+
Here's the disk-collector service configuration to add to your `compose.yml`:
28+
29+
```yml
30+
disk-collector:
31+
image: ghcr.io/crosstalk-solutions/project-nomad-disk-collector:latest
32+
pull_policy: always
33+
container_name: nomad_disk_collector
34+
restart: unless-stopped
35+
volumes:
36+
- /:/host:ro,rslave # Read-only view of host FS with rslave propagation so /sys and /proc submounts are visible
37+
- /opt/project-nomad/storage:/storage
38+
```
39+
40+
Then remove the `- /tmp/nomad-disk-info.json:/app/storage/nomad-disk-info.json` bind mount from the admin service's `volumes:` list.

install/migrate-disk-collector.sh

Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
#!/bin/bash
2+
3+
# Project N.O.M.A.D. — Disk Collector Migration Script
4+
#
5+
# Script | Project N.O.M.A.D. Disk Collector Migration Script
6+
# Version | 1.0.0
7+
# Author | Crosstalk Solutions, LLC
8+
# Website | https://crosstalksolutions.com
9+
#
10+
# PURPOSE:
11+
# One-time migration from the host-based disk info collector to the
12+
# disk-collector Docker sidecar. The old approach used a nohup background
13+
# process that wrote to /tmp/nomad-disk-info.json, which was bind-mounted
14+
# into the admin container. This broke on host reboots because /tmp is
15+
# cleared and Docker would create a directory at the mount point instead of a file.
16+
#
17+
# The new approach uses a disk-collector sidecar container that reads host
18+
# disk info via the /:/host:ro,rslave bind-mount pattern (same pattern as Prometheus
19+
# node-exporter, and no SYS_ADMIN or privileged capabilities required) and writes directly to
20+
# /opt/project-nomad/storage/nomad-disk-info.json, which the admin container
21+
# already reads via its existing storage bind-mount. Thus, no admin image update
22+
# or new volume mounts required.
23+
24+
###############################################################################
25+
# Color Codes
26+
###############################################################################
27+
28+
# ANSI SGR escape sequences used for status prefixes. They are stored
# un-interpreted (single quotes) and only rendered when passed to `echo -e`.
GREEN='\033[1;32m'   # bold green
RED='\033[1;31m'     # bold red
YELLOW='\033[1;33m'  # bold yellow
WHITE_R='\033[39m'   # default foreground colour
RESET='\033[0m'      # clear all attributes
33+
34+
###############################################################################
35+
# Constants
36+
###############################################################################
37+
38+
# Installation root; the compose file and the legacy PID file live under it.
NOMAD_DIR="/opt/project-nomad"
# Project name handed to `docker compose -p`.
COMPOSE_PROJECT_NAME="project-nomad"
# Compose file that this migration edits in place.
COMPOSE_FILE="${NOMAD_DIR}/compose.yml"
41+
42+
###############################################################################
43+
# Pre-flight Checks
44+
###############################################################################
45+
46+
# Abort unless the interpreter is bash.
#
# Globals: BASH_VERSION (read), RED/GREEN/RESET (read)
# Outputs: a status line on stdout
# Exits:   1 when BASH_VERSION is unset or empty
#
# The guard deliberately sticks to POSIX syntax (`[` and printf). `[[` does not
# exist in plain sh/dash: there it fails as "command not found", the condition
# evaluates false, and the check would wrongly report "Running in bash".
check_is_bash() {
  if [ -z "${BASH_VERSION:-}" ]; then
    printf '%b\n' "${RED}#${RESET} This script must be run with bash."
    printf '%b\n' "${RED}#${RESET} Example: bash $(basename "$0")"
    exit 1
  fi
  printf '%b\n' "${GREEN}#${RESET} Running in bash.\n"
}
54+
55+
# Confirm that sudo can be used without prompting.
#
# Globals: RED/GREEN/RESET (read)
# Outputs: a status line on stdout
# Exits:   1 when `sudo -n true` fails
#
# NOTE(review): none of the visible migration steps invoke sudo themselves, so
# this check only helps when the script is already run as root — confirm
# whether a root/EUID check is what is actually intended.
check_has_sudo() {
  # -n: never prompt; fail immediately if a password would be required.
  if ! sudo -n true 2>/dev/null; then
    echo -e "${RED}#${RESET} This script requires sudo permissions."
    echo -e "${RED}#${RESET} Example: sudo bash $(basename "$0")"
    exit 1
  fi

  echo -e "${GREEN}#${RESET} Sudo permissions confirmed.\n"
}
64+
65+
# Describe what the migration will do and ask the operator to confirm.
#
# Globals: YELLOW/RED/GREEN/RESET (read), response (written)
# Inputs:  one line from stdin; only a bare "y" or "Y" continues
# Outputs: the notice and the outcome on stdout
# Exits:   0 (nothing changed) when the answer is anything else
check_confirmation() {
  local notice
  for notice in \
    "This script migrates your Project N.O.M.A.D. installation from the" \
    "host-based disk info collector to the new disk-collector sidecar." \
    "It will modify compose.yml and restart the full compose stack" \
    "to drop the old /tmp bind mount and start the disk-collector sidecar." \
    "Please ensure you have a backup of your data before proceeding."; do
    echo -e "${YELLOW}#${RESET} ${notice}"
  done

  read -rp "Do you want to continue? (y/N) " response

  case "$response" in
    y | Y)
      echo -e "${GREEN}#${RESET} Confirmation received. Proceeding with migration...\n"
      ;;
    *)
      echo -e "${RED}#${RESET} Aborting. No changes have been made."
      exit 0
      ;;
  esac
}
78+
79+
# Ensure the docker CLI exists and the docker systemd unit is active.
#
# Globals: RED/GREEN/RESET (read)
# Outputs: a status line on stdout
# Exits:   1 when docker is not on PATH or `systemctl is-active docker` fails
check_docker_running() {
  command -v docker &>/dev/null || {
    echo -e "${RED}#${RESET} Docker is not installed. Cannot proceed."
    exit 1
  }

  systemctl is-active --quiet docker || {
    echo -e "${RED}#${RESET} Docker is not running. Please start Docker and try again."
    exit 1
  }

  echo -e "${GREEN}#${RESET} Docker is running.\n"
}
90+
91+
# Verify that the compose file targeted by the migration exists.
#
# Globals: COMPOSE_FILE (read), RED/GREEN/RESET (read)
# Outputs: a status line on stdout
# Exits:   1 when COMPOSE_FILE is not a regular file
check_compose_file() {
  if [[ -f "$COMPOSE_FILE" ]]; then
    echo -e "${GREEN}#${RESET} Found compose.yml at ${COMPOSE_FILE}.\n"
    return
  fi

  echo -e "${RED}#${RESET} compose.yml not found at ${COMPOSE_FILE}."
  echo -e "${RED}#${RESET} Project N.O.M.A.D. does not appear to be installed or compose.yml is missing."
  exit 1
}
99+
100+
# Step 1: Stop old host process
101+
# Stop the legacy host-side collector recorded in the PID file, then remove
# the PID file.
#
# Globals: NOMAD_DIR (read), YELLOW/GREEN/RESET (read)
# Outputs: status lines on stdout
# Returns: 0 in all handled cases (a missing or stale PID is not an error)
#
# The PID file lives under NOMAD_DIR and therefore survives reboots, while the
# background collector does not. A stale PID may have been reused by an
# unrelated process, so the process is only signalled when its command line
# still references the collect_disk_info script.
stop_old_host_process() {
  local pid_file="${NOMAD_DIR}/nomad-collect-disk-info.pid"
  local pid cmdline

  if [[ ! -f "$pid_file" ]]; then
    echo -e "${GREEN}#${RESET} No old collect-disk-info PID file found — nothing to stop.\n"
    return 0
  fi

  echo -e "${YELLOW}#${RESET} Stopping old collect-disk-info background process..."
  pid=$(tr -d '[:space:]' < "$pid_file" 2>/dev/null)

  if [[ ! "$pid" =~ ^[0-9]+$ ]]; then
    # Empty or garbled file: never hand that to kill.
    echo -e "${YELLOW}#${RESET} PID file did not contain a valid PID — nothing to stop.\n"
  else
    cmdline=""
    if [[ -r "/proc/${pid}/cmdline" ]]; then
      # Arguments are NUL-separated in /proc; make them a single line.
      cmdline=$(tr '\0' ' ' < "/proc/${pid}/cmdline" 2>/dev/null)
    elif command -v ps >/dev/null 2>&1; then
      cmdline=$(ps -p "$pid" -o args= 2>/dev/null)
    fi

    if [[ "$cmdline" == *collect_disk_info* ]]; then
      if kill "$pid" 2>/dev/null; then
        echo -e "${GREEN}#${RESET} Process ${pid} stopped.\n"
      else
        echo -e "${YELLOW}#${RESET} Process ${pid} was not running (already stopped).\n"
      fi
    elif [[ -z "$cmdline" ]]; then
      echo -e "${YELLOW}#${RESET} Process ${pid} was not running (already stopped).\n"
    else
      echo -e "${YELLOW}#${RESET} PID ${pid} now belongs to a different process — leaving it running.\n"
    fi
  fi

  rm -f "$pid_file"
}
118+
119+
# Step 2: Backup compose.yml
120+
# Copy the compose file to a timestamped sibling before it is modified.
#
# Globals: COMPOSE_FILE (read), YELLOW/RED/GREEN/RESET (read)
# Outputs: status lines on stdout
# Exits:   1 when the copy fails
backup_compose_file() {
  local stamp backup_path
  stamp=$(date +%Y%m%d%H%M%S)
  backup_path="${COMPOSE_FILE}.bak.${stamp}"

  echo -e "${YELLOW}#${RESET} Backing up compose.yml to ${backup_path}..."
  if ! cp "$COMPOSE_FILE" "$backup_path"; then
    echo -e "${RED}#${RESET} Failed to create backup. Aborting."
    exit 1
  fi
  echo -e "${GREEN}#${RESET} Backup created at ${backup_path}.\n"
}
130+
131+
# Step 3: Remove old bind-mount from admin volumes
132+
# Delete the legacy /tmp/nomad-disk-info.json bind-mount line from the
# compose file.
#
# Globals: COMPOSE_FILE (read, edited in place), YELLOW/RED/GREEN/WHITE_R/RESET (read)
# Outputs: status lines on stdout
# Returns: 0 when no reference to nomad-disk-info.json is present
# Exits:   1 when a reference to nomad-disk-info.json is still present after
#          the edit (for example, the mount is written in an unexpected form)
remove_old_bind_mount() {
  local marker='nomad-disk-info\.json'

  grep -q "$marker" "$COMPOSE_FILE" || {
    echo -e "${GREEN}#${RESET} Old /tmp/nomad-disk-info.json bind-mount not found — already removed.\n"
    return 0
  }

  echo -e "${YELLOW}#${RESET} Removing old /tmp/nomad-disk-info.json bind-mount from admin volumes..."
  # Only the exact host:container pair is deleted.
  sed -i '/\/tmp\/nomad-disk-info\.json:\/app\/storage\/nomad-disk-info\.json/d' "$COMPOSE_FILE"

  # Any surviving mention means the line was not in the expected form.
  if grep -q "$marker" "$COMPOSE_FILE"; then
    echo -e "${RED}#${RESET} Failed to remove old bind-mount from compose.yml. Please remove it manually:"
    echo -e "${WHITE_R}      - /tmp/nomad-disk-info.json:/app/storage/nomad-disk-info.json${RESET}"
    exit 1
  fi

  echo -e "${GREEN}#${RESET} Old bind-mount removed.\n"
}
149+
150+
# Step 4: Add disk-collector service block
151+
# Insert the disk-collector service definition into the compose file,
# immediately before the top-level `volumes:` key.
#
# Globals: COMPOSE_FILE (read, rewritten in place), YELLOW/RED/GREEN/RESET (read)
# Outputs: status lines on stdout
# Returns: 0 when the service is already present or was added
# Exits:   1 when no top-level `volumes:` key exists or the rewrite fails;
#          the compose file is left untouched in that case
#
# NOTE(review): the emitted block assumes service names are indented by two
# spaces under `services:` (keys at four, list items at six) — confirm this
# matches the installed compose.yml.
add_disk_collector_service() {
  local staged

  if grep -q 'disk-collector:' "$COMPOSE_FILE"; then
    echo -e "${GREEN}#${RESET} disk-collector service already present in compose.yml — skipping.\n"
    return 0
  fi

  echo -e "${YELLOW}#${RESET} Adding disk-collector service to compose.yml..."

  # Stage the new content in an unpredictable temp file next to the original.
  if ! staged=$(mktemp "${COMPOSE_FILE}.tmp.XXXXXX"); then
    echo -e "${RED}#${RESET} Failed to add disk-collector service. Please add it manually before the top-level volumes: key."
    exit 1
  fi

  # awk exits non-zero when the anchor was never seen, so a file without a
  # top-level `volumes:` key is never rewritten. The staged content is then
  # written *into* the existing file (rather than renamed over it) so its
  # owner and permission bits are preserved.
  if awk '
    /^volumes:/ && !inserted {
      print "  disk-collector:"
      print "    image: ghcr.io/crosstalk-solutions/project-nomad-disk-collector:latest"
      print "    pull_policy: always"
      print "    container_name: nomad_disk_collector"
      print "    restart: unless-stopped"
      print "    volumes:"
      print "      - /:/host:ro,rslave # Read-only view of host FS with rslave propagation so /sys and /proc submounts are visible"
      print "      - /opt/project-nomad/storage:/storage # Shared storage dir — disk info written here is read by the admin container"
      print ""
      inserted = 1
    }
    { print }
    END { exit inserted ? 0 : 1 }
  ' "$COMPOSE_FILE" > "$staged" && cat "$staged" > "$COMPOSE_FILE"; then
    rm -f "$staged"
  else
    rm -f "$staged"
    echo -e "${RED}#${RESET} Failed to add disk-collector service. Please add it manually before the top-level volumes: key."
    exit 1
  fi

  if ! grep -q 'disk-collector:' "$COMPOSE_FILE"; then
    echo -e "${RED}#${RESET} Failed to add disk-collector service. Please add it manually before the top-level volumes: key."
    exit 1
  fi

  echo -e "${GREEN}#${RESET} disk-collector service added.\n"
}
180+
181+
# Step 5 — Pull new image and restart the full stack
182+
# This will re-create the admin container and drop the old /tmp bind, and
183+
# also starts the new disk-collector sidecar we just added to compose.yml
184+
# Pull the images referenced by the compose file, then bring the stack up
# detached.
#
# Globals: COMPOSE_PROJECT_NAME, COMPOSE_FILE (read), YELLOW/RED/GREEN/RESET (read)
# Outputs: status lines on stdout
# Exits:   1 when either `docker compose pull` or `docker compose up -d` fails
restart_stack() {
  # Shared prefix so pull and up target the same project and compose file.
  local -a compose_cmd=(docker compose -p "$COMPOSE_PROJECT_NAME" -f "$COMPOSE_FILE")

  echo -e "${YELLOW}#${RESET} Pulling latest images (including disk-collector)..."
  "${compose_cmd[@]}" pull || {
    echo -e "${RED}#${RESET} Failed to pull images. Check your network connection."
    exit 1
  }
  echo -e "${GREEN}#${RESET} Images pulled.\n"

  echo -e "${YELLOW}#${RESET} Restarting stack..."
  "${compose_cmd[@]}" up -d || {
    echo -e "${RED}#${RESET} Failed to bring the stack up."
    exit 1
  }
  echo -e "${GREEN}#${RESET} Stack restarted.\n"
}
199+
200+
# Step 6: Verify
201+
# Check, after a short settle delay, that the nomad_disk_collector container
# is listed by `docker ps` with status "running".
#
# Globals: RED/GREEN/RESET (read)
# Outputs: a status line on stdout
# Exits:   1 when the container is not listed as running
verify_disk_collector_running() {
  local container="nomad_disk_collector"
  local running_names

  # Give the freshly started container a moment before inspecting it.
  sleep 3
  running_names=$(docker ps --filter "name=^${container}$" --filter "status=running" --format '{{.Names}}')

  # -x: require a whole-line match on the container name.
  if ! grep -qx "$container" <<< "$running_names"; then
    echo -e "${RED}#${RESET} disk-collector container does not appear to be running."
    echo -e "${RED}#${RESET} Check its logs with: docker logs nomad_disk_collector"
    exit 1
  fi

  echo -e "${GREEN}#${RESET} disk-collector container is running.\n"
}
211+
212+
# Main
213+
# Entry point: print the banner, run the pre-flight checks, execute the six
# migration steps in order, then print the completion summary. Every check and
# step exits the script itself on failure, so reaching the summary means all
# of them passed.
main() {
  local rule="${GREEN}#########################################################################${RESET}"
  local preflight

  echo -e "${rule}"
  echo -e "${GREEN}#${RESET} Project N.O.M.A.D. — Disk Collector Migration Script ${GREEN}#${RESET}"
  echo -e "${rule}\n"

  for preflight in \
    check_is_bash \
    check_has_sudo \
    check_confirmation \
    check_docker_running \
    check_compose_file; do
    "$preflight"
  done

  echo -e "${YELLOW}#${RESET} Step 1: Stopping old host process...\n"
  stop_old_host_process

  echo -e "${YELLOW}#${RESET} Step 2: Backing up compose.yml...\n"
  backup_compose_file

  echo -e "${YELLOW}#${RESET} Step 3: Removing old bind-mount...\n"
  remove_old_bind_mount

  echo -e "${YELLOW}#${RESET} Step 4: Adding disk-collector service...\n"
  add_disk_collector_service

  echo -e "${YELLOW}#${RESET} Step 5: Pulling images and restarting stack...\n"
  restart_stack

  echo -e "${YELLOW}#${RESET} Step 6: Verifying disk-collector is running...\n"
  verify_disk_collector_running

  echo -e "${rule}"
  echo -e "${GREEN}#${RESET} Migration completed successfully!"
  echo -e "${GREEN}#${RESET}"
  echo -e "${GREEN}#${RESET} The disk-collector sidecar is now running and will update disk info"
  echo -e "${GREEN}#${RESET} every 2 minutes. The /api/system/info endpoint will return disk data"
  echo -e "${GREEN}#${RESET} after the first collector write (~5 seconds after startup)."
  echo -e "${GREEN}#${RESET}"
  echo -e "${rule}\n"
}

main "$@"

0 commit comments

Comments
 (0)