PPC-BSC / Software / Deep Health Compss / Commits

Commit 834cd5bb
Authored Feb 22, 2022 by salbiach

Multi GPU support and MN scripts

Parent: ca5b0161
Changes: 11 files
docker/Dockerfile

@@ -41,13 +41,19 @@ RUN set -x && \
 WORKDIR /root
 RUN mkdir pyeddl
 # Creating environment and activating it for next dockerfile runs
-RUN conda create --name "pyeddl_pycompss_env" python=3.6.15
+RUN conda create --name "pyeddl_pycompss_env" python=3.6
 SHELL ["conda", "run", "-n", "pyeddl_pycompss_env", "/bin/bash", "-c"]
 # pyeddl installation
 #RUN conda config --add channels dhealth && \
 #    conda config --add channels conda-forge && \
 #    conda config --set channel_priority strict && \
 #    conda install pyeddl-gpu==1.1.0
+RUN conda config --add channels dhealth && \
+    conda config --add channels bioconda && \
+    conda config --add channels conda-forge && \
+    conda config --set channel_priority strict && \
+    conda install pyeddl-gpu==1.1.0 && \
+    conda install pyecvl-gpu
 # pycompss installation
 RUN pip install pycompss==2.8 dislib==0.6.4 dill
 # Few useful utils for users
docker/Makefile

@@ -11,8 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 TAG = latest-gpu
 #TAG = ecvl-gpu-test
 PREFIX = registry.gitlab.bsc.es/ppc-bsc/software/deep-health-compss
 IMAGE = compss-eddl
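For orientation, these variables compose the image reference registry.gitlab.bsc.es/ppc-bsc/software/deep-health-compss/compss-eddl:latest-gpu. The Makefile targets themselves are not part of this diff, so the manual equivalent below is only a sketch under that assumption:

    # Hypothetical manual equivalent of the Makefile build/push, using PREFIX, IMAGE and TAG above
    docker build -t registry.gitlab.bsc.es/ppc-bsc/software/deep-health-compss/compss-eddl:latest-gpu docker/
    docker push registry.gitlab.bsc.es/ppc-bsc/software/deep-health-compss/compss-eddl:latest-gpu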
docker/compss/distrib_executions.sh  (new file, 0 → 100644)

#!/bin/bash
#source get_pods_ip.sh
#masterIP=(${Nodes[2]})

echo "MasterIP is:" $MY_POD_IP
cd pyeddl

DATASET="cifar10"
NETWORK="lenet"
NUM_EPOCHS=10
NUM_WORKERS=1
SYNC_TYPE=0
NUM_GPU=1
RUNS=5

BASE_FILE="/root/exec_out/gpu${NUM_GPU}/"
mkdir -p $BASE_FILE

for i in $(seq 1 ${RUNS})
do
    conda run --no-capture-output -n pyeddl_pycompss_env runcompss --lang=python --python_interpreter=python3 \
        --project=/root/project.xml --resources=/root/resources.xml --master_name=$MY_POD_IP \
        eddl_master_train_batch.py --dataset=${DATASET} --network=${NETWORK} --num_epochs=${NUM_EPOCHS} \
        --num_workers=${NUM_WORKERS} --sync_type=${SYNC_TYPE} \
        > ${BASE_FILE}distrib_${DATASET}_${NETWORK}_gpu_${NUM_GPU}_sync_${SYNC_TYPE}_workers_${NUM_WORKERS}_r${i}.out
done
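This script is meant to run inside the COMPSs master container: it relies on MY_POD_IP and on the /root/project.xml and /root/resources.xml shipped in the image. A possible way to trigger it from outside the cluster, assuming kubectl access, that the script sits in /root, and a placeholder pod name (all assumptions, not shown in this commit):

    # "compss-master-0" is a hypothetical pod name; adjust to the actual master pod
    kubectl exec -it compss-master-0 -- bash /root/distrib_executions.sh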
docker/compss/get_pods_ip.sh

@@ -20,4 +20,4 @@
 Nodes=$(curl --cacert ${CACERT} --header "Authorization: Bearer ${TOKEN}" -X GET "${APISERVER}/api/v1/namespaces/${NAMESPACE}/endpoints/" | jq -rM ".items[].subsets[].addresses[].ip" | xargs echo)
 echo "Pods IP's are:"
-echo $Nodes
\ No newline at end of file
+echo $Nodes
docker/compss/mn_distrib.sh  (new file, 0 → 100644)

#!/bin/bash -e
#SBATCH --ntasks=1
#SBATCH --error=exec_out/distrib_%j.err
#SBATCH --output=exec_out/distrib_%j.out

# Define application variables
exec_file=$(pwd)/eddl_master_train_batch.py

DATASET="cifar10"
NETWORK="lenet"
NUM_EPOCHS=10
RUNS=3

#qos_flag="--qos=debug"
qos_flag=""

CPUS_PER_NODE=48
WORKER_IN_MASTER_CPUS=0
execution_time=10 # in minutes
graph=$tracing

export OMP_NUM_THREADS=48

for w in 2
do
  for s in 0 1 2
  do
    for i in $(seq 1 ${RUNS})
    do
      NUM_WORKERS=${w}
      SYNC_TYPE=${s}
      num_nodes=`expr ${NUM_WORKERS} + 1`

      # Enqueue job
      enqueue_compss --sc_cfg=mn.cfg \
        --job_name="eddl_w${w}_s${s}_r${i}" \
        --num_nodes="${num_nodes}" \
        --cpus_per_node="${CPUS_PER_NODE}" \
        --cpus_per_task \
        --worker_in_master_cpus="${WORKER_IN_MASTER_CPUS}" \
        --exec_time="${execution_time}" \
        --scheduler=es.bsc.compss.scheduler.loadbalancing.LoadBalancingScheduler \
        --worker_working_dir=/home/bsc37/bsc37726/projects/compss/ \
        "${qos_flag}" \
        --lang=python \
        --python_interpreter="python3" \
        "$exec_file" --dataset=${DATASET} --network=${NETWORK} --num_workers=${NUM_WORKERS} --num_epochs=${NUM_EPOCHS} --sync_type=${SYNC_TYPE} > exec_out/distrib_${DATASET}_${NETWORK}_sync_${SYNC_TYPE}_workers_${NUM_WORKERS}.out
    done
  done
done
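Each pass through the nested loops enqueues one COMPSs job (one per combination of workers, sync type and run; 9 jobs with the values above), with num_nodes = NUM_WORKERS + 1 so that an extra node is reserved for the master. A typical launch from a MareNostrum login node might look like the sketch below; the module name is an assumption, and the script could equally be submitted with sbatch given its #SBATCH headers:

    # Hypothetical launch; COMPSs must be loaded so that enqueue_compss is on PATH
    module load COMPSs
    bash mn_distrib.sh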
docker/compss/mn_sequential.sh  (new file, 0 → 100644)

#!/bin/bash
#SBATCH --job-name="sequential_cifar10_lenet"
#SBATCH --workdir=.
#SBATCH --output=exec_out/sequential_%j.out
#SBATCH --error=exec_out/sequential_%j.err
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=48
#SBATCH --time=00:60:00

#NUM_WORKERS=1
#SYNC_TYPE=0
#DATASET="cifar10"
#NETWORK="lenet"
#NUM_EPOCHS=10
#python3 -u eddl_master_train_batch.py --dataset=${DATASET} --network=${NETWORK} --num_workers=${NUM_WORKERS} --num_epochs=${NUM_EPOCHS} --sync_type=${SYNC_TYPE} > exec_out/sequential_${DATASET}_${NETWORK}_sync_${SYNC_TYPE}_r1.out

DATASET="cifar10"
NETWORK="lenet"
NUM_EPOCHS=10
NUM_WORKERS=1
SYNC_TYPE=0
RUNS=1

BASE_FILE="/home/bsc37/bsc37726/projects/compss/exec_out/"
mkdir -p $BASE_FILE

export OMP_NUM_THREADS=48

for i in $(seq 1 ${RUNS})
do
    python3 -u \
        eddl_master_train_batch.py --dataset=${DATASET} --network=${NETWORK} --num_epochs=${NUM_EPOCHS} \
        --num_workers=${NUM_WORKERS} --sync_type=${SYNC_TYPE} \
        > ${BASE_FILE}sequential_${DATASET}_${NETWORK}_sync_${SYNC_TYPE}_r${i}.out
done
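Given the #SBATCH directives, this single-node baseline is presumably submitted through Slurm; a minimal sketch, assuming the submission directory already contains eddl_master_train_batch.py as the script expects:

    # Submit the sequential baseline; Slurm logs land in exec_out/sequential_<jobid>.out
    sbatch mn_sequential.sh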
docker/compss/sequential_executions.sh  (new file, 0 → 100644)

#!/bin/bash
#source get_pods_ip.sh
#masterIP=(${Nodes[2]})

echo "MasterIP is:" $MY_POD_IP
cd pyeddl

DATASET="cifar10"
NETWORK="lenet"
NUM_EPOCHS=10
NUM_WORKERS=1
SYNC_TYPE=0
NUM_GPU=1
RUNS=5

BASE_FILE="/root/exec_out/gpu${NUM_GPU}/"
mkdir -p $BASE_FILE

for i in $(seq 1 ${RUNS})
do
    conda run --no-capture-output -n pyeddl_pycompss_env python -u \
        eddl_master_train_batch.py --dataset=${DATASET} --network=${NETWORK} --num_epochs=${NUM_EPOCHS} \
        --num_workers=${NUM_WORKERS} --sync_type=${SYNC_TYPE} \
        > ${BASE_FILE}sequential_${DATASET}_${NETWORK}_gpu_${NUM_GPU}_sync_${SYNC_TYPE}_r${i}.out
done
docker/pyeddl/eddl_master_distributed_api.py

@@ -18,13 +18,13 @@ from timeit import default_timer as timer
 compss_object: Eddl_Compss_Distributed = None

-def build(net, dataset, network, use_gpu):
+def build(net, dataset, network, use_gpu, num_gpu):

     # Initialize the compss object
     global compss_object
     compss_object = Eddl_Compss_Distributed()

     # Define the computing service to use
-    CS = eddl.CS_GPU() if use_gpu else eddl.CS_CPU()
+    CS = eddl.CS_GPU(num_gpu) if use_gpu else eddl.CS_CPU()

     # Build the model in the master
     eddl.build(

@@ -37,7 +37,7 @@ def build(net, dataset, network, use_gpu):
     )

     # Build the model in each distributed computing unit
-    compss_object.build(dataset, network, use_gpu)
+    compss_object.build(dataset, network, use_gpu, num_gpu)

     # Wait until the models are created in each computing unit
     print("Building the model in distributed computing units...")

@@ -45,7 +45,7 @@ def build(net, dataset, network, use_gpu):
     print("Building done!")

-def fit_sync(model_params, x_train_dist, y_train_dist, num_workers, num_epochs, workers_batch_size, use_gpu):
+def fit_sync(model_params, x_train_dist, y_train_dist, num_workers, num_epochs, workers_batch_size, use_gpu, num_gpu):
     """
     Synchronization every epoch
     """

@@ -76,7 +76,7 @@ def fit_sync(model_params, x_train_dist, y_train_dist, num_workers, num_epochs,
                 block_y,
                 model_params,
                 num_images_per_worker,
                 workers_batch_size,
-                use_gpu)
+                use_gpu,
+                num_gpu)

         # Wait until every computing unit finishes its training (synchronous training)
         worker_params = compss_wait_on(worker_params)

@@ -91,7 +91,7 @@ def fit_sync(model_params, x_train_dist, y_train_dist, num_workers, num_epochs,
     return model_params

-def fit_async(model_params, x_train_dist, y_train_dist, num_workers, num_epochs, workers_batch_size, use_gpu):
+def fit_async(model_params, x_train_dist, y_train_dist, num_workers, num_epochs, workers_batch_size, use_gpu, num_gpu):
     """
     Partial parameter aggregation after every worker completion
     """

@@ -120,7 +120,7 @@ def fit_async(model_params, x_train_dist, y_train_dist, num_workers, num_epochs,
                 block_y,
                 worker_params[j],
                 num_images_per_worker,
                 workers_batch_size,
-                use_gpu)
+                use_gpu,
+                num_gpu)

             # model_params is COMMUTATIVE therefore it is updating in each call
             worker_params[j] = compss_object.aggregate_parameters_async(model_params, worker_params[j], (1 / num_workers))

@@ -131,7 +131,7 @@ def fit_async(model_params, x_train_dist, y_train_dist, num_workers, num_epochs,
     return model_params

-def fit_full_async(model_params, x_train_dist, y_train_dist, num_workers, num_epochs, workers_batch_size, use_gpu):
+def fit_full_async(model_params, x_train_dist, y_train_dist, num_workers, num_epochs, workers_batch_size, use_gpu, num_gpu):
     """
     Parameter aggregation at the end of num_epochs only
     """

@@ -162,7 +162,7 @@ def fit_full_async(model_params, x_train_dist, y_train_dist, num_workers, num_ep
                 block_y,
                 worker_params[j],
                 num_images_per_worker,
                 workers_batch_size,
-                use_gpu)
+                use_gpu,
+                num_gpu)

     # Wait until every computing unit finishes its training (synchronous training)
docker/pyeddl/eddl_master_train_batch.py

@@ -30,6 +30,7 @@ def main(args):
     dataset = args.dataset
     network = args.network
     use_gpu = args.gpu
+    num_gpu = [1 for i in range(args.num_gpu)]
     sync_type = args.sync_type

     # Define available datasets and network implementations

@@ -72,7 +73,7 @@ def main(args):
     ##########################
     ##### MODEL BUILDING #####
     ##########################
-    compss_api.build(net, dataset, network, use_gpu)
+    compss_api.build(net, dataset, network, use_gpu, num_gpu)

     eddl.summary(net)

@@ -113,7 +114,7 @@ def main(args):
             num_workers,
             num_epochs,
             workers_batch_size,
-            use_gpu)
+            use_gpu,
+            num_gpu)
     elif (sync_type == 1):
         model_params = compss_api.fit_async(

@@ -122,7 +123,8 @@ def main(args):
             y_train_dist,
             num_workers,
             num_epochs,
             workers_batch_size,
-            use_gpu)
+            use_gpu,
+            num_gpu)
     elif (sync_type == 2):
         model_params = compss_api.fit_full_async(

@@ -131,7 +133,8 @@ def main(args):
             y_train_dist,
             num_workers,
             num_epochs,
             workers_batch_size,
-            use_gpu)
+            use_gpu,
+            num_gpu)
     else:
         print("No such sync type option available")

@@ -158,6 +161,8 @@ if __name__ == "__main__":
     parser.add_argument("--num_epochs", type=int, metavar="INT", default=10)           # Number of epochs to run the training
     parser.add_argument("--workers_batch_size", type=int, metavar="INT", default=250)  # Size of each batch of the training phase
     parser.add_argument("--gpu", type=bool, metavar="BOOL", default=False)             # True: Use GPU as CS --- False: Use CPU as CS
+    parser.add_argument("--num_gpu", type=int, metavar="INT", default=1)               # Number of GPUs per node
     parser.add_argument("--sync_type", type=int, metavar="INT", default=0)             # 0: synchronous --- 1: asynchronous --- 2: fully asynchronous

     main(parser.parse_args(sys.argv[1:]))
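The new --num_gpu flag is expanded into a list of ones (num_gpu = [1 for i in range(args.num_gpu)]), which is the per-device mask that eddl.CS_GPU() receives in the build and training tasks, so --num_gpu=2 selects two GPUs per node. A hedged local invocation, reusing the dataset/network values from the scripts above and assuming two GPUs are actually visible on the node:

    # Example: train with the GPU computing service on 2 GPUs per node
    python3 -u eddl_master_train_batch.py --dataset=cifar10 --network=lenet \
        --num_workers=1 --num_epochs=10 --sync_type=0 --gpu=True --num_gpu=2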
docker/pyeddl/eddl_worker_distributed.py

@@ -22,7 +22,7 @@ class Eddl_Compss_Distributed:

     @constraint(computing_units="${OMP_NUM_THREADS}")
     @task(dataset=IN, network=IN, use_gpu=IN, is_replicated=True)
-    def build(self, dataset, network, use_gpu):
+    def build(self, dataset, network, use_gpu, num_gpu):

         # Dictionary relating the dataset with its number of classes and the first layer of the associated network
         dataset_network_dict = {
             "mnist": [10, {

@@ -47,7 +47,7 @@ class Eddl_Compss_Distributed:
         net = eddl.Model([in_], [out])

         # Define the computing service to use
-        CS = eddl.CS_GPU() if use_gpu else eddl.CS_CPU()
+        CS = eddl.CS_GPU(num_gpu) if use_gpu else eddl.CS_CPU()

         # Build the model in this very node
         eddl.build(

@@ -78,7 +78,7 @@ class Eddl_Compss_Distributed:
             model_params,
             num_images_per_worker,
             workers_batch_size,
-            use_gpu):
+            use_gpu,
+            num_gpu):

         # Convert data to tensors
         x_train = to_tensor(x_train)

@@ -88,7 +88,7 @@ class Eddl_Compss_Distributed:
         model = eddl.import_net_from_onnx_string(self.model)

         # Define the computing service to use
-        CS = eddl.CS_GPU() if use_gpu else eddl.CS_CPU()
+        CS = eddl.CS_GPU(num_gpu) if use_gpu else eddl.CS_CPU()

         # Build the model after deserializing and before injecting the parameters
         eddl.build(
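The worker task constraint computing_units="${OMP_NUM_THREADS}" is resolved from the environment, which is why the MareNostrum scripts above export it before launching. A minimal reminder, assuming the same 48-core nodes as CPUS_PER_NODE in mn_distrib.sh:

    # COMPSs evaluates ${OMP_NUM_THREADS} in the @constraint decorator,
    # so export it before runcompss / enqueue_compss
    export OMP_NUM_THREADS=48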
kubernetes/compss_deephealth.yaml

@@ -31,7 +31,7 @@ spec:
       image: registry.gitlab.bsc.es/ppc-bsc/software/deep-health-compss/compss-eddl:latest-gpu
       resources:
         limits:
-          nvidia.com/gpu: 1 # requesting 1 GPU
+          nvidia.com/gpu: 2 # requesting 2 GPUs
       imagePullPolicy: Always
       ports:
       - containerPort: 22

@@ -66,7 +66,7 @@ spec:
   selector:
     matchLabels:
       app: compss
-  replicas: 16
+  replicas: 2
   template:
     metadata:
       labels:

@@ -87,7 +87,7 @@ spec:
       image: registry.gitlab.bsc.es/ppc-bsc/software/deep-health-compss/compss-eddl:latest-gpu
       resources:
         limits:
-          nvidia.com/gpu: 1 # requesting 1 GPU
+          nvidia.com/gpu: 2 # requesting 2 GPUs
       imagePullPolicy: Always
       # env:
       # - name: MY_POD_NAME
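The manifest now requests two GPUs per pod and lowers the replica count from 16 to 2. Re-applying it is the usual kubectl step; the target namespace and context are not specified in this commit, so the default ones are assumed:

    # Re-apply the updated deployment; pods are recreated with the new GPU limits
    kubectl apply -f kubernetes/compss_deephealth.yaml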