
add

Signed-off-by: Jean-Michel Batto <jmbatto@eldarsoft.com>
Jean-Michel Batto, 1 month ago
parent commit 4a716c279f
4 files changed with 215 additions and 0 deletions
  1. docker-compose.yml (+152, -0)
  2. ssh/id_rsa.mpi (+27, -0)
  3. ssh/id_rsa.mpi.pub (+1, -0)
  4. test-batch.sh (+35, -0)

+ 152 - 0
docker-compose.yml

@@ -0,0 +1,152 @@
+# Run `docker swarm init` first, then create the attachable overlay network
+# (yml_mpinet is the name referenced in the networks section below):
+# docker network create --driver=overlay --attachable yml_mpinet
+# Dec 09, 2025 - Jean-Michel Batto
+
+services:
+  mysql:
+    image: mariadb:10.10
+    hostname: mysql
+    container_name: mysql
+    environment:
+      MARIADB_ROOT_PASSWORD: password
+      MARIADB_DATABASE: slurm_acct_db
+      MARIADB_USER: slurm
+      MARIADB_PASSWORD: password
+    volumes:
+      - var_lib_mysql:/var/lib/mysql
+    ports:
+      - "3306:3306"
+    networks:
+      - mpinet
+  slurmdbd:
+    image: jmbatto/m2chps-mpi41-slurm:latest
+    container_name: slurmdbd
+    hostname: slurmdbd
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+      - etc_munge:/etc/munge
+      #      - etc_slurm:/etc/slurm
+      - var_log_slurm:/var/log/slurm
+    environment:
+      - SLURMPARAM=slurmdbd
+    expose:
+      - "6819"
+    healthcheck:
+      test: ["CMD-SHELL", "netstat -tuln | grep 6819 || /bin/bash -c 'echo \"/usr/sbin/slurmdbd -Dvvv\" | grep slurmdbd'"]
+      interval: 10s
+      timeout: 5s
+      retries: 10
+      start_period: 20s
+    depends_on:
+      - mysql
+    shm_size: "512m"
+    secrets:
+      - source: "id_rsa"
+        target: "/home/mpiuser/.ssh-source/id_rsa"
+      - source: "id_rsa_mpi_pub"
+        target: "/home/mpiuser/.ssh-source/id_rsa.pub"
+      - source: "authorized_keys"
+        target: "/home/mpiuser/.ssh-source/authorized_keys"
+    networks:
+      - mpinet
+  slurmctld:
+    image: jmbatto/m2chps-mpi41-slurm:latest
+    container_name: slurmctld
+    hostname: slurmctld
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+      - etc_munge:/etc/munge
+      #      - etc_slurm:/etc/slurm
+      - slurm_jobdir:/data
+      - var_log_slurm:/var/log/slurm
+      - usrlocalvarmpi-foo:/usr/local/var/mpishare
+    environment:
+      - SLURMPARAM=slurmctld
+    expose:
+      - "6817"
+    depends_on:
+      slurmdbd:
+        condition: service_healthy
+    shm_size: "512m"
+    secrets:
+      - source: "id_rsa"
+        target: "/home/mpiuser/.ssh-source/id_rsa"
+      - source: "id_rsa_mpi_pub"
+        target: "/home/mpiuser/.ssh-source/id_rsa.pub"
+      - source: "authorized_keys"
+        target: "/home/mpiuser/.ssh-source/authorized_keys"
+    networks:
+      - mpinet
+  c1:
+    image: jmbatto/m2chps-mpi41-slurm:latest
+    hostname: c1
+    container_name: c1
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+      - etc_munge:/etc/munge
+      #      - etc_slurm:/etc/slurm
+      - slurm_jobdir:/data
+      - var_log_slurm:/var/log/slurm
+      - usrlocalvarmpi-foo:/usr/local/var/mpishare
+    environment:
+      - SLURMPARAM=slurmd
+    expose:
+      - "6818"
+    depends_on:
+      - "slurmctld"
+    shm_size: "512m"
+    secrets:
+      - source: "id_rsa"
+        target: "/home/mpiuser/.ssh-source/id_rsa"
+      - source: "id_rsa_mpi_pub"
+        target: "/home/mpiuser/.ssh-source/id_rsa.pub"
+      - source: "authorized_keys"
+        target: "/home/mpiuser/.ssh-source/authorized_keys"
+    networks:
+      - mpinet
+  c2:
+    image: jmbatto/m2chps-mpi41-slurm:latest
+    hostname: c2
+    container_name: c2
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+      - etc_munge:/etc/munge
+      #      - etc_slurm:/etc/slurm
+      - slurm_jobdir:/data
+      - var_log_slurm:/var/log/slurm
+      - usrlocalvarmpi-foo:/usr/local/var/mpishare
+    environment:
+      - SLURMPARAM=slurmd
+    expose:
+      - "6818"
+    depends_on:
+      - "slurmctld"
+    shm_size: "512m"
+    networks:
+      - mpinet
+    secrets:
+      - source: "id_rsa"
+        target: "/home/mpiuser/.ssh-source/id_rsa"
+      - source: "id_rsa_mpi_pub"
+        target: "/home/mpiuser/.ssh-source/id_rsa.pub"
+      - source: "authorized_keys"
+        target: "/home/mpiuser/.ssh-source/authorized_keys"
+secrets:
+  id_rsa_mpi_pub:
+    file: ssh/id_rsa.mpi.pub
+  id_rsa:
+    file: ssh/id_rsa.mpi
+  authorized_keys:
+    file: ssh/id_rsa.mpi.pub
+networks:
+  mpinet:
+    # the driver is fixed when the network is created (see header comment);
+    # an external network is only referenced by name here, so no driver key
+    external: true
+    name: yml_mpinet
+volumes:
+  etc_munge:
+  #  etc_slurm:
+  slurm_jobdir:
+  var_lib_mysql:
+  var_log_slurm:
+  usrlocalvarmpi-foo:
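
A minimal bring-up sequence, assuming the Docker CLI with the compose plugin and that this file is saved as docker-compose.yml in the current directory (only the two commands from the header comment appear in the commit; the rest is an illustrative sketch):

    # one-time setup: swarm mode plus the attachable overlay the services join
    docker swarm init
    docker network create --driver=overlay --attachable yml_mpinet

    # start the cluster, then wait for slurmdbd to report healthy
    # (slurmctld only starts once the healthcheck passes)
    docker compose up -d
    docker inspect --format '{{.State.Health.Status}}' slurmdbd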

+ 27 - 0
ssh/id_rsa.mpi

@@ -0,0 +1,27 @@
+-----BEGIN RSA PRIVATE KEY-----
+MIIEogIBAAKCAQEA7PWLZmgdJ508dD15T6+xqGDvL9Ehzo9SgsnN6xJ+qpUvvOi4
+1axW0AqR4MnPTg/uuvk+x4tUpuufOW4w22UTGjsdvmIVWa9ujLtcRiN3YPY+SU+Y
+O5FfqKg7r/hBn+/GMcSoffwSs7vVgmhBBnp/mJh2O1cOAFZEe98/47mbg3/kHBAk
+36NOQktaU3l48B38EhBTnjWfcEGm1HcTRPFxXV5Wiko6ZhKFEuHcTVKng4ROtUqE
+mgHyI0aB7TAxg4na0ejItsYWEPWGeDOw6ms/4MwylxNosWzHFPW9p4zgLCLNr+b6
+bDDfYKjXZflAuTQtQhLmJUwD9uuYLAijpSE2fQIDAQABAoIBADgcgRET8Gt0CV/B
+OtvKz/f+VEVvcWD3gWNlJDTZIVOFllNWjIZUlA4ZoqenQkbK8Q4nfV1FOht4yjCQ
+TlN1oMtiWk297i5Zo4UBzPzy4w774I39oh/g8dT/WXr2/5s+7SDV38xNh6Q2A34o
+79T35wUcfUrZ93/O7dKjb/6d8hx2FMha0wVKqY4lmG1lQE3bbx3kakec0PdvU5kO
+YHKlpqj3pMR7CpMa+4yL/iXFwWYmnK+uu+zw7JR7PwvH1CzrnvW438wjQ1QmYbSx
+mHHOE89X67Lsl5hn81qYWBhpwAlBwi1qscsE0cV9GcFyKqWFqZsj5coM9u3CRfvy
+lrWe1OUCgYEA+LBUFEd3Hxs4sFiYElJ8R9SAs1udaqPvAl01hTEijJLfYlMMVs/y
+rgNN7j22zjDak2f8QdyMJZX7EZdRmdYcHO0csYOwbYvalzcnwk+U3mxmdD3r4xSo
+DSvkJ70fogAqUlcVIg2re6fCmZVJQTvMQYTVEM8zQomJRt/Lb2esSfsCgYEA8+zv
+44aToe8uqiDs4w8guRW7LCDkTw4z4IVo9JUibIaPjaAs5bZEBXSB43EEywXCR75H
+fML0rU1PVvKh1rqcvZdVzm+XMWVr3asPk0sapaiHaTcmyZvJRDxxqbLFp0zRP1T6
+cCtXNFdHWU4KiuKrUi6cDyOKchpfkSZa4seiT+cCgYB+n4FgBfdQPlMB70oW4irn
+g/q32CjxuGCk6oKqu5bkzo+xB6obtavSEFqouIGQwO056tNVUY+GP7Rjg5GH663K
+yKw4cl3tmS0Gm43B8TVSfw03mKO3rrfWZQe5eCFYIg9qd26KNT2gK435FzsCXQkm
+PxUhhu6JrW/ZR2/U3Iur6wKBgADrWLAb1ryagSuE+j+U1AO+kDkHWrTtkcZ72jxp
+v3p3O11GSEUJXdJDcSXhTCpTuDq6/dv7hB6PFwh126RKicKxKlKf2wsFndV1Cpb8
+hnovW2tLGOtTmfuW2rrQAKyzvmolsNfxYd/BoHQ2thV16z1hDZeFA8WQUeHjKh6G
+sBbrAoGATdtQlaUxx4izua6k02ihkxx/cRYwDl2N8UDvDBHokS7vJFMX8b8NpsGg
+zMElnqSpu/pe/0UG7N2MtPF6uyMcX8AZzzcsRkiMkDvWJzYt8Jpf+Eyd/uryF+Yv
+yrXaOEY83tm6x/fny5ZaZmk8lNth7bfWywuTMkZLX3fYpWtIeE4=
+-----END RSA PRIVATE KEY-----

+ 1 - 0
ssh/id_rsa.mpi.pub

@@ -0,0 +1 @@
+ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDs9YtmaB0nnTx0PXlPr7GoYO8v0SHOj1KCyc3rEn6qlS+86LjVrFbQCpHgyc9OD+66+T7Hi1Sm6585bjDbZRMaOx2+YhVZr26Mu1xGI3dg9j5JT5g7kV+oqDuv+EGf78YxxKh9/BKzu9WCaEEGen+YmHY7Vw4AVkR73z/juZuDf+QcECTfo05CS1pTeXjwHfwSEFOeNZ9wQabUdxNE8XFdXlaKSjpmEoUS4dxNUqeDhE61SoSaAfIjRoHtMDGDidrR6Mi2xhYQ9YZ4M7Dqaz/gzDKXE2ixbMcU9b2njOAsIs2v5vpsMN9gqNdl+UC5NC1CEuYlTAP265gsCKOlITZ9 oweidner@peahi
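
This keypair is committed to the repository (the oweidner@peahi comment suggests it was copied from an upstream docker/MPI example), so anyone with the repo can authenticate into the containers. A sketch for generating a fresh pair in place, assuming ssh-keygen is available; the output file names must match the secrets section of docker-compose.yml:

    # regenerate the shared MPI keypair; -N '' means no passphrase
    # produces ssh/id_rsa.mpi and ssh/id_rsa.mpi.pub, which the compose
    # file also reuses as authorized_keys
    ssh-keygen -t rsa -b 2048 -N '' -f ssh/id_rsa.mpi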

+ 35 - 0
test-batch.sh

@@ -0,0 +1,35 @@
+#!/bin/bash
+#SBATCH --job-name=test_mpi       # Job name
+#SBATCH --partition=docker        # Partition name (defined in your slurm.conf)
+#SBATCH --nodes=2                 # Request 2 nodes (c1 and c2)
+#SBATCH --ntasks=2                # Request 2 tasks in total
+#SBATCH --ntasks-per-node=1       # 1 task per node (to force distribution across nodes)
+#SBATCH --output=res_%j.out       # Standard output file (%j = job ID)
+#SBATCH --error=res_%j.err        # Error file
+
+echo "=========================================="
+echo "SLURM job ID : $SLURM_JOB_ID"
+echo "Start time : $(date)"
+echo "Running on master node : $(hostname)"
+echo "Allocated node list : $SLURM_JOB_NODELIST"
+echo "=========================================="
+
+echo ""
+echo ">>> TEST 1 : Basic node check (srun hostname)"
+# Should print c1 and c2
+srun hostname
+
+echo ""
+echo ">>> TEST 2 : MPI test via Python (mpi4py)"
+# This Python one-liner prints each MPI rank and its host name.
+# slurm.conf uses MpiDefault=pmi2, so srun should handle the MPI wire-up.
+srun python3 -c "from mpi4py import MPI; \
+comm = MPI.COMM_WORLD; \
+rank = comm.Get_rank(); \
+size = comm.Get_size(); \
+host = MPI.Get_processor_name(); \
+print(f'MPI SUCCESS: I am rank {rank} of {size}, running on container {host}')"
+
+echo ""
+echo "=========================================="
+echo "End of job : $(date)"