Commit 834cd5bb authored by salbiach's avatar salbiach
Browse files

Multi GPU support and MN scripts

parent ca5b0161
......@@ -41,13 +41,19 @@ RUN set -x && \
WORKDIR /root
RUN mkdir pyeddl
# Creating environment and activating it for next dockerfile runs
RUN conda create --name "pyeddl_pycompss_env" python=3.6.15
RUN conda create --name "pyeddl_pycompss_env" python=3.6
SHELL ["conda", "run", "-n", "pyeddl_pycompss_env", "/bin/bash", "-c"]
# pyeddl installation
#RUN conda config --add channels dhealth && \
# conda config --add channels conda-forge && \
# conda config --set channel_priority strict && \
# conda install pyeddl-gpu==1.1.0
# Install the GPU builds of pyeddl and pyecvl from the dhealth channel.
# Strict channel priority makes dhealth > bioconda > conda-forge resolution
# deterministic. `-y` is required so the build never blocks on a prompt.
# NOTE(review): the previous version dropped the `&& \` before the pyecvl
# install, leaving it outside the RUN instruction entirely.
RUN conda config --add channels dhealth && \
    conda config --add channels bioconda && \
    conda config --add channels conda-forge && \
    conda config --set channel_priority strict && \
    conda install -y pyeddl-gpu==1.1.0 pyecvl-gpu
    # TODO(review): pin pyecvl-gpu to the release matching pyeddl 1.1.0 for reproducible builds
# pycompss intallation
RUN pip install pycompss==2.8 dislib===0.6.4 dill
# Few useful utils for users
......
......@@ -11,8 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
TAG = latest-gpu
#TAG = ecvl-gpu-test
PREFIX = registry.gitlab.bsc.es/ppc-bsc/software/deep-health-compss
IMAGE = compss-eddl
......
#!/bin/bash
# Launch RUNS distributed (PyCOMPSs + runcompss) GPU trainings of
# eddl_master_train_batch.py inside the pyeddl_pycompss_env conda env,
# writing each run's stdout to its own file under ${BASE_FILE}.
#source get_pods_ip.sh
#masterIP=(${Nodes[2]})
echo "MasterIP is:" "$MY_POD_IP"
# Abort if the working directory is missing instead of launching from the wrong place (SC2164).
cd pyeddl || exit 1
# Experiment configuration
DATASET="cifar10"
NETWORK="lenet"
NUM_EPOCHS=10
NUM_WORKERS=1
SYNC_TYPE=0    # 0: synchronous, 1: asynchronous, 2: fully asynchronous
NUM_GPU=1      # GPUs per node; forwarded to the trainer below
RUNS=5
# Output directory prefix — trailing slash is required by the redirection below.
BASE_FILE="/root/exec_out/gpu${NUM_GPU}/"
mkdir -p "$BASE_FILE"
for i in $( seq 1 "${RUNS}" )
do
    # NOTE(review): --num_gpu is now passed explicitly; previously NUM_GPU only
    # appeared in the log filename, so the trainer silently used its default.
    conda run --no-capture-output -n pyeddl_pycompss_env runcompss --lang=python --python_interpreter=python3 \
    --project=/root/project.xml --resources=/root/resources.xml --master_name="$MY_POD_IP" \
    eddl_master_train_batch.py --dataset=${DATASET} --network=${NETWORK} --num_epochs=${NUM_EPOCHS} \
    --num_workers=${NUM_WORKERS} --sync_type=${SYNC_TYPE} --num_gpu=${NUM_GPU} \
    > "${BASE_FILE}distrib_${DATASET}_${NETWORK}_gpu_${NUM_GPU}_sync_${SYNC_TYPE}_workers_${NUM_WORKERS}_r${i}.out"
done
......@@ -20,4 +20,4 @@ Nodes=$(curl --cacert ${CACERT} --header "Authorization: Bearer ${TOKEN}" -X GET
"${APISERVER}/api/v1/namespaces/${NAMESPACE}/endpoints/" | jq -rM ".items[].subsets[].addresses[].ip" | xargs echo)
echo "Pods IP's are:"
echo $Nodes
\ No newline at end of file
echo $Nodes
#!/bin/bash -e
#SBATCH --ntasks=1
#SBATCH --error=exec_out/distrib_%j.err
#SBATCH --output=exec_out/distrib_%j.out
# Sweep of distributed trainings on MareNostrum: for every (workers, sync mode)
# combination, enqueue RUNS identical COMPSs jobs via enqueue_compss.

# Define application variables
exec_file=$(pwd)/eddl_master_train_batch.py
DATASET="cifar10"
NETWORK="lenet"
NUM_EPOCHS=10
RUNS=3
#qos_flag="--qos=debug"
qos_flag=""
CPUS_PER_NODE=48
WORKER_IN_MASTER_CPUS=0
execution_time=10 #in minutes
export OMP_NUM_THREADS=48
# The redirection below fails if the log directory does not exist yet.
mkdir -p exec_out
for w in 2
do
    for s in 0 1 2
    do
        for i in $( seq 1 ${RUNS} )
        do
            NUM_WORKERS=${w}
            SYNC_TYPE=${s}
            # One extra node for the COMPSs master.
            num_nodes=$(( NUM_WORKERS + 1 ))
            # Enqueue job.
            # NOTE(review): ${qos_flag} is deliberately unquoted so an empty value
            # expands to no argument at all; quoting it passed a spurious "" arg.
            # NOTE(review): bare --cpus_per_task (no value) kept as-is — confirm it
            # is the intended enqueue_compss flag and not a missing assignment.
            enqueue_compss --sc_cfg=mn.cfg \
            --job_name="eddl_w${w}_s${s}_r${i}" \
            --num_nodes="${num_nodes}" \
            --cpus_per_node="${CPUS_PER_NODE}" \
            --cpus_per_task \
            --worker_in_master_cpus="${WORKER_IN_MASTER_CPUS}" \
            --exec_time="${execution_time}" \
            --scheduler=es.bsc.compss.scheduler.loadbalancing.LoadBalancingScheduler \
            --worker_working_dir=/home/bsc37/bsc37726/projects/compss/ \
            ${qos_flag} \
            --lang=python \
            --python_interpreter="python3" \
            "$exec_file" --dataset=${DATASET} --network=${NETWORK} --num_workers=${NUM_WORKERS} --num_epochs=${NUM_EPOCHS} --sync_type=${SYNC_TYPE} > exec_out/distrib_${DATASET}_${NETWORK}_sync_${SYNC_TYPE}_workers_${NUM_WORKERS}_r${i}.out
        done
    done
done
#!/bin/bash
#SBATCH --job-name="sequential_cifar10_lenet"
#SBATCH --workdir=.
#SBATCH --output=exec_out/sequential_%j.out
#SBATCH --error=exec_out/sequential_%j.err
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=48
#SBATCH --time=00:60:00
# Sequential (single-node, no COMPSs) baseline training on MareNostrum.
# Earlier one-shot invocation, kept for reference:
#NUM_WORKERS=1
#SYNC_TYPE=0
#DATASET="cifar10"
#NETWORK="lenet"
#NUM_EPOCHS=10
#python3 -u eddl_master_train_batch.py --dataset=${DATASET} --network=${NETWORK} --num_workers=${NUM_WORKERS} --num_epochs=${NUM_EPOCHS} --sync_type=${SYNC_TYPE} > exec_out/sequential_${DATASET}_${NETWORK}_sync_${SYNC_TYPE}_r1.out
# Experiment configuration
DATASET="cifar10"
NETWORK="lenet"
NUM_EPOCHS=10
NUM_WORKERS=1
SYNC_TYPE=0
RUNS=1
# Directory that collects the .out logs (trailing slash required).
BASE_FILE="/home/bsc37/bsc37726/projects/compss/exec_out/"
mkdir -p "${BASE_FILE}"
export OMP_NUM_THREADS=48
for (( i = 1; i <= RUNS; i++ ))
do
    python3 -u eddl_master_train_batch.py \
        --dataset=${DATASET} \
        --network=${NETWORK} \
        --num_epochs=${NUM_EPOCHS} \
        --num_workers=${NUM_WORKERS} \
        --sync_type=${SYNC_TYPE} \
        > ${BASE_FILE}sequential_${DATASET}_${NETWORK}_sync_${SYNC_TYPE}_r${i}.out
done
#!/bin/bash
# Run RUNS sequential (no COMPSs) GPU trainings of eddl_master_train_batch.py
# inside the pyeddl_pycompss_env conda env, one log file per run.
#source get_pods_ip.sh
#masterIP=(${Nodes[2]})
echo "MasterIP is:" "$MY_POD_IP"
# Abort if the working directory is missing instead of launching from the wrong place (SC2164).
cd pyeddl || exit 1
# Experiment configuration
DATASET="cifar10"
NETWORK="lenet"
NUM_EPOCHS=10
NUM_WORKERS=1
SYNC_TYPE=0    # 0: synchronous, 1: asynchronous, 2: fully asynchronous
NUM_GPU=1      # GPUs per node; forwarded to the trainer below
RUNS=5
# Output directory prefix — trailing slash is required by the redirection below.
BASE_FILE="/root/exec_out/gpu${NUM_GPU}/"
mkdir -p "$BASE_FILE"
for i in $( seq 1 "${RUNS}" )
do
    # NOTE(review): --num_gpu is now passed explicitly; previously NUM_GPU only
    # appeared in the log filename, so the trainer silently used its default.
    conda run --no-capture-output -n pyeddl_pycompss_env python -u \
    eddl_master_train_batch.py --dataset=${DATASET} --network=${NETWORK} --num_epochs=${NUM_EPOCHS} \
    --num_workers=${NUM_WORKERS} --sync_type=${SYNC_TYPE} --num_gpu=${NUM_GPU} \
    > "${BASE_FILE}sequential_${DATASET}_${NETWORK}_gpu_${NUM_GPU}_sync_${SYNC_TYPE}_r${i}.out"
done
......@@ -18,13 +18,13 @@ from timeit import default_timer as timer
compss_object: Eddl_Compss_Distributed = None
def build(net, dataset, network, use_gpu):
def build(net, dataset, network, use_gpu, num_gpu):
# Initialize the compss object
global compss_object
compss_object = Eddl_Compss_Distributed()
# Define the computing service to use
CS = eddl.CS_GPU() if use_gpu else eddl.CS_CPU()
CS = eddl.CS_GPU(num_gpu) if use_gpu else eddl.CS_CPU()
# Build the model in the master
eddl.build(
......@@ -37,7 +37,7 @@ def build(net, dataset, network, use_gpu):
)
# Build the model in each distributed computing unit
compss_object.build(dataset, network, use_gpu)
compss_object.build(dataset, network, use_gpu, num_gpu)
# Wait until the models are created in each computing unit
print("Building the model in distributed computing units...")
......@@ -45,7 +45,7 @@ def build(net, dataset, network, use_gpu):
print("Building done!")
def fit_sync(model_params, x_train_dist, y_train_dist, num_workers, num_epochs, workers_batch_size, use_gpu):
def fit_sync(model_params, x_train_dist, y_train_dist, num_workers, num_epochs, workers_batch_size, use_gpu, num_gpu):
"""
Synchronization every epoch
"""
......@@ -76,7 +76,7 @@ def fit_sync(model_params, x_train_dist, y_train_dist, num_workers, num_epochs,
block_y,
model_params,
num_images_per_worker,
workers_batch_size, use_gpu)
workers_batch_size, use_gpu, num_gpu)
# Wait until every computing unit finishes its training (synchronous training)
worker_params = compss_wait_on(worker_params)
......@@ -91,7 +91,7 @@ def fit_sync(model_params, x_train_dist, y_train_dist, num_workers, num_epochs,
return model_params
def fit_async(model_params, x_train_dist, y_train_dist, num_workers, num_epochs, workers_batch_size, use_gpu):
def fit_async(model_params, x_train_dist, y_train_dist, num_workers, num_epochs, workers_batch_size, use_gpu, num_gpu):
"""
Partial parameter aggregation after every worker completion
"""
......@@ -120,7 +120,7 @@ def fit_async(model_params, x_train_dist, y_train_dist, num_workers, num_epochs,
block_y,
worker_params[j],
num_images_per_worker,
workers_batch_size, use_gpu)
workers_batch_size, use_gpu, num_gpu)
# model_params is COMMUTATIVE therefore it is updating in each call
worker_params[j] = compss_object.aggregate_parameters_async( model_params, worker_params[j], (1 / num_workers))
......@@ -131,7 +131,7 @@ def fit_async(model_params, x_train_dist, y_train_dist, num_workers, num_epochs,
return model_params
def fit_full_async(model_params, x_train_dist, y_train_dist, num_workers, num_epochs, workers_batch_size, use_gpu):
def fit_full_async(model_params, x_train_dist, y_train_dist, num_workers, num_epochs, workers_batch_size, use_gpu, num_gpu):
"""
Parameter aggregation at the end of num_epochs only
"""
......@@ -162,7 +162,7 @@ def fit_full_async(model_params, x_train_dist, y_train_dist, num_workers, num_ep
block_y,
worker_params[j],
num_images_per_worker,
workers_batch_size, use_gpu)
workers_batch_size, use_gpu, num_gpu)
# Wait until every computing unit finishes its training (synchronous training)
......
......@@ -30,6 +30,7 @@ def main(args):
dataset = args.dataset
network = args.network
use_gpu = args.gpu
num_gpu = [1 for i in range(args.num_gpu)]
sync_type = args.sync_type
# Define available datasets and network implementations
......@@ -72,7 +73,7 @@ def main(args):
##########################
##### MODEL BUILDING #####
##########################
compss_api.build(net, dataset, network, use_gpu)
compss_api.build(net, dataset, network, use_gpu, num_gpu)
eddl.summary(net)
......@@ -113,7 +114,7 @@ def main(args):
num_workers,
num_epochs,
workers_batch_size,
use_gpu)
use_gpu, num_gpu)
elif (sync_type == 1):
model_params =compss_api.fit_async(
......@@ -122,7 +123,8 @@ def main(args):
y_train_dist,
num_workers,
num_epochs,
workers_batch_size, use_gpu)
workers_batch_size,
use_gpu, num_gpu)
elif (sync_type == 2):
model_params =compss_api.fit_full_async(
......@@ -131,7 +133,8 @@ def main(args):
y_train_dist,
num_workers,
num_epochs,
workers_batch_size, use_gpu)
workers_batch_size,
use_gpu, num_gpu)
else:
print("No such sync type option available")
......@@ -158,6 +161,8 @@ if __name__ == "__main__":
parser.add_argument("--num_epochs", type=int, metavar="INT", default=10) # Number of epochs to run the training
parser.add_argument("--workers_batch_size", type=int, metavar="INT", default=250) # Size of each batch of the training phase
# NOTE(review): argparse's type=bool converts ANY non-empty string ("False",
# "0", ...) to True; only "--gpu=" yields False. Consider action="store_true"
# or a str-to-bool converter — confirm no caller relies on the current quirk.
parser.add_argument("--gpu", type=bool, metavar="BOOL", default=False) # True: Use GPU as CS --- False: Use CPU as CS
parser.add_argument("--num_gpu", type=int, metavar="INT", default=1) # Number of GPUs per node
parser.add_argument("--sync_type", type=int, metavar="INT", default=0) # 0: synchronous --- 1: asynchronous --- 2: fully asynchronous
main(parser.parse_args(sys.argv[1:]))
......@@ -22,7 +22,7 @@ class Eddl_Compss_Distributed:
@constraint(computing_units="${OMP_NUM_THREADS}")
@task(dataset=IN,network=IN, use_gpu=IN, is_replicated=True)
def build(self, dataset, network, use_gpu):
def build(self, dataset, network, use_gpu, num_gpu):
# Dictionary relating the dataset with its number of classes and the first layer of the associated network
dataset_network_dict = {"mnist": [10, {
......@@ -47,7 +47,7 @@ class Eddl_Compss_Distributed:
net = eddl.Model([in_], [out])
# Define the computing service to use
CS = eddl.CS_GPU() if use_gpu else eddl.CS_CPU()
CS = eddl.CS_GPU(num_gpu) if use_gpu else eddl.CS_CPU()
# Build the model in this very node
eddl.build(
......@@ -78,7 +78,7 @@ class Eddl_Compss_Distributed:
model_params,
num_images_per_worker,
workers_batch_size,
use_gpu):
use_gpu, num_gpu):
# Convert data to tensors
x_train = to_tensor(x_train)
......@@ -88,7 +88,7 @@ class Eddl_Compss_Distributed:
model = eddl.import_net_from_onnx_string(self.model)
# Define the computing service to use
CS = eddl.CS_GPU() if use_gpu else eddl.CS_CPU()
CS = eddl.CS_GPU(num_gpu) if use_gpu else eddl.CS_CPU()
# Build the model after deserializing and before injecting the parameters
eddl.build(
......
......@@ -31,7 +31,7 @@ spec:
image: registry.gitlab.bsc.es/ppc-bsc/software/deep-health-compss/compss-eddl:latest-gpu
resources:
limits:
nvidia.com/gpu: 1 # requesting 1 GPU
nvidia.com/gpu: 2 # requesting 2 GPUs
imagePullPolicy: Always
ports:
- containerPort: 22
......@@ -66,7 +66,7 @@ spec:
selector:
matchLabels:
app: compss
replicas: 16
replicas: 2
template:
metadata:
labels:
......@@ -87,7 +87,7 @@ spec:
image: registry.gitlab.bsc.es/ppc-bsc/software/deep-health-compss/compss-eddl:latest-gpu
resources:
limits:
nvidia.com/gpu: 1 # requesting 1 GPU
nvidia.com/gpu: 2 # requesting 2 GPUs
imagePullPolicy: Always
# env:
# - name: MY_POD_NAME
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment