Inconsistent performance:
jobs 946956-58
19.267, 30.403, 50.911
jobs 947618-20
44.807, 28.639, 45.942
jobs 948067-69
22.320, 33.170, 45.191
Without -update gpu:
27.690, 29.114, 30.437, 20.010, 33.940, 28.225, 31.716, 27.610
Set constraint algorithm to LINCS because -update gpu is not supported with SHAKE
10,000 steps are not enough for reliable timing, because GROMACS reports wall time, not CPU time.
I will submit another benchmark done with 400,000 steps.
---------8><---------><8----------
#!/bin/bash
#SBATCH --mem-per-cpu=2000 --time=1:0:0 -c12 --ntasks=1 --gpus-per-node=1 --nodes=1 -A def-svassili
# Benchmark job: runs `gmx mdrun` on topol.tpr with the nonbonded, PME, bonded
# and update work all requested on the GPU, then prints selected lines of
# md.log so the run configuration and performance can be compared between jobs.

# One OpenMP thread per CPU allocated by Slurm (-c12 above); falls back to 1
# when SLURM_CPUS_PER_TASK is unset (e.g. the script is run outside Slurm).
export OMP_NUM_THREADS="${SLURM_CPUS_PER_TASK:-1}"

module load StdEnv/2020 gcc/9.3.0 cuda/11.4 openmpi/4.0.3 gromacs/2021.4

# The thread count is quoted so it always reaches mdrun as a single argument.
# NOTE(review): -cpi state.cpi presumably makes the run continue from that
# checkpoint when it exists — confirm this is wanted for a timing benchmark.
gmx mdrun -ntomp "$OMP_NUM_THREADS" -nb gpu -pme gpu -bonded gpu -update gpu -s topol.tpr -cpi state.cpi

# Report lines from md.log: those containing "Brand", the line following
# "Number of GPUs detected:", the OpenMP thread count, and the performance.
grep Brand md.log
grep -A1 "Number of GPUs detected:" md.log | tail -n1
grep "The number of OpenMP threads" md.log
grep Performance: md.log
; GROMACS run-parameter (.mdp) settings used for the benchmark runs.
; NOTE(review): 'title' is believed to be an obsolete entry that recent
; grompp versions ignore — confirm before relying on it.
title = benchmark
; Run parameters
integrator = md
; 400000 steps: the earlier 10000-step runs were too short for reliable
; timing because the reported figure is wall time.
nsteps = 400000
dt = 0.001
; Output control
; Coordinate, velocity and force output intervals are all set to 0.
nstxout = 0
nstvout = 0
nstfout = 0
nstenergy = 10000
nstlog = 10000
nstxout-compressed = 50000
compressed-x-grps = System
; Bond parameters
continuation = yes
; LINCS is chosen because mdrun's -update gpu is not supported with SHAKE.
constraint_algorithm = lincs
constraints = h-bonds
; Neighborsearching
cutoff-scheme = Verlet
; NOTE(review): 'ns_type' is believed to be obsolete in GROMACS 2020 and
; later (ignored by grompp) — confirm before removing.
ns_type = grid
nstlist = 10
rcoulomb = 0.8
rvdw = 0.8
DispCorr = Ener ; analytic VDW correction
; Electrostatics
coulombtype = PME
pme_order = 4
; PME grid dimensions are fixed explicitly at 144 points along each axis.
fourier-nx = 144
fourier-ny = 144
fourier-nz = 144
; Temperature coupling is on
tcoupl = V-rescale
tc-grps = system
tau_t = 0.1
ref_t = 300
; Pressure coupling is on
pcoupl = Parrinello-Rahman
pcoupltype = isotropic
tau_p = 2.0
ref_p = 1.0
compressibility = 4.5e-5
; Periodic boundary conditions
pbc = xyz
; Velocity generation
; No new velocities are generated; this pairs with continuation = yes above.
gen_vel = no