Commit a72e98cf authored by salbiach's avatar salbiach
Browse files

Fewer prints and eval

parent a0dc0491
......@@ -20,12 +20,12 @@ class Eddl_Compss_Distributed:
@task(serialized_model=IN, optimizer=IN, losses=IN, metrics=IN, compserv=IN, is_replicated=True)
def build(self, serialized_model, optimizer, losses, metrics, compserv):
print("Arranca build task en worker")
#print("Arranca build task en worker")
# Deserialize the received model
model = eddl.import_net_from_onnx_string(serialized_model)
# print(eddl.summary(model))
#print(eddl.summary(model))
# Build the model in this very node
eddl.build(
......@@ -40,7 +40,7 @@ class Eddl_Compss_Distributed:
# Save the model. We have to serialize it to a string so COMPSs is able to serialize and deserialize from disk
self.model = eddl.serialize_net_to_onnx_string(model, False)
print("Finaliza build task en worker")
#print("Finaliza build task en worker")
@constraint(computing_units="${OMP_NUM_THREADS}")
@task(
......@@ -63,14 +63,14 @@ class Eddl_Compss_Distributed:
x_train = to_tensor(x_train)
y_train = to_tensor(y_train)
print("Entrando en train batch task en worker")
#print("Entrando en train batch task en worker")
import sys
sys.stdout.flush()
# Deserialize from disk
model = eddl.import_net_from_onnx_string(self.model)
print("Modelo deserializado de disco")
#print("Modelo deserializado de disco")
sys.stdout.flush()
# The model needs to be built after deserializing and before injecting the parameters
......@@ -83,41 +83,42 @@ class Eddl_Compss_Distributed:
False
)
print("Modelo built")
#print("Modelo built")
sys.stdout.flush()
# Set the parameters sent from master to the model
#model.setParameters(net_parametersToTensor(initial_parameters))
eddl.set_parameters(model, net_parametersToTensor(initial_parameters))
print("Parametros seteados")
#print("Parametros seteados")
sys.stdout.flush()
# print(eddl.summary(model))
#print(eddl.summary(model))
print("Build completed in train batch task")
sys.stdout.flush()
#print("Build completed in train batch task")
#sys.stdout.flush()
num_batches = int(num_images_per_worker / workers_batch_size)
eddl.reset_loss(model)
print("Num batches: ", num_batches)
sys.stdout.flush()
#print("Num batches: ", num_batches)
#sys.stdout.flush()
for i in range(num_epochs_for_param_sync):
print("Epoch %d/%d (%d batches)" % (i + 1, num_epochs_for_param_sync, num_batches))
#print("Epoch %d/%d (%d batches)" % (i + 1, num_epochs_for_param_sync, num_batches))
for j in range(num_batches):
indices = list(range(j * num_batches, (j + 1) * num_batches - 1))
print("Empieza batch")
sys.stdout.flush()
#print("Empieza batch")
#sys.stdout.flush()
eddl.train_batch(model, [x_train], [y_train], indices)
print("Finaliza batch")
sys.stdout.flush()
eddl.print_loss(model, j)
print()
print("Train batch individual completed in train batch task")
#print("Finaliza batch")
#sys.stdout.flush()
eddl.print_loss(model, num_batches) # previously j because it was inside the loop
print()
print("Train batch individual completed in worker's train batch task")
print()
sys.stdout.flush()
# Random noise time sleep for experimentation purposes
......@@ -149,6 +150,8 @@ class Eddl_Compss_Distributed:
# Deserialize from disk
model = eddl.import_net_from_onnx_string(self.model)
import sys
# Set the parameters sent from master to the model
#model.setParameters(net_parametersToTensor(initial_parameters))
eddl.set_parameters(model, net_parametersToTensor(initial_parameters))
......@@ -165,24 +168,26 @@ class Eddl_Compss_Distributed:
False
)
print("Build completed in train batch task")
#print("Build completed in train batch task")
num_batches = int(num_images_per_worker / workers_batch_size)
eddl.reset_loss(model)
print("Num batches: ", num_batches)
#print("Num batches: ", num_batches)
for i in range(num_epochs_for_param_sync):
print("Epoch %d/%d (%d batches)" % (i + 1, num_epochs_for_param_sync, num_batches))
#print("Epoch %d/%d (%d batches)" % (i + 1, num_epochs_for_param_sync, num_batches))
for j in range(num_batches):
indices = list(range(j * num_batches, (j + 1) * num_batches - 1))
eddl.train_batch(model, [x_train], [y_train], indices)
eddl.print_loss(model, j)
print()
print("Train batch individual completed in train batch task")
eddl.print_loss(model, num_batches)
print()
print("Train batch individual completed in worker's train batch task")
print()
sys.stdout.flush()
# Get parameters from the model and convert them to numpy so COMPSS can serialize them
#final_parameters = net_parametersToNumpy(model.getParameters())
......@@ -224,7 +229,7 @@ class Eddl_Compss_Distributed:
False
)
print("Build completed in evaluate task")
#print("Build completed in evaluate task")
if train_test_flag == "train":
......
......@@ -49,9 +49,9 @@ def train_batch(model, x_train, y_train, num_workers, num_epochs_for_param_sync,
s = x_train.shape
num_images_per_worker = int(s[0] / num_workers)
print("Num workers: ", num_workers)
print("Num images per worker: ", num_images_per_worker)
print("Workers batch size: ", workers_batch_size)
#print("Num workers: ", num_workers)
#print("Num images per worker: ", num_images_per_worker)
#print("Workers batch size: ", workers_batch_size)
# Pickle test
......@@ -90,16 +90,15 @@ def train_batch(model, x_train, y_train, num_workers, num_epochs_for_param_sync,
# Set trained and aggregated parameters to the model
eddl.set_parameters(model, net_parametersToTensor(final_weights))
def train_batch_async(model, x_train, y_train, num_workers, num_epochs_for_param_sync, workers_batch_size):
global compss_object
s = x_train.shape
num_images_per_worker = int(s[0] / num_workers)
print("Num workers: ", num_workers)
print("Num images per worker: ", num_images_per_worker)
print("Workers batch size: ", workers_batch_size)
#print("Num workers: ", num_workers)
#print("Num images per worker: ", num_images_per_worker)
#print("Workers batch size: ", workers_batch_size)
# Array of final parameters whose initial value is initial parameters
#accumulated_parameters = net_parametersToNumpy(model.getParameters())
......@@ -126,8 +125,8 @@ def train_batch_async(model, x_train, y_train, num_workers, num_epochs_for_param
accumulated_parameters = compss_wait_on(accumulated_parameters)
# workers_parameters = compss_wait_on(workers_parameters)
print("Workers parameters: ", workers_parameters)
print("Final accumulated parameters: ", accumulated_parameters)
#print("Workers parameters: ", workers_parameters)
#print("Final accumulated parameters: ", accumulated_parameters)
# Set trained and aggregated parameters to the model
#model.setParameters(net_parametersToTensor(accumulated_parameters))
......@@ -141,9 +140,9 @@ def fit_async(model, x_train, y_train, num_workers, num_epochs_for_param_sync, m
s = x_train.shape
num_images_per_worker = int(s[0] / num_workers)
print("Num workers: ", num_workers)
print("Num images per worker: ", num_images_per_worker)
print("Workers batch size: ", workers_batch_size)
#print("Num workers: ", num_workers)
#print("Num images per worker: ", num_images_per_worker)
#print("Workers batch size: ", workers_batch_size)
# Array of final parameters whose initial value is initial parameters
#accumulated_parameters = net_parametersToNumpy(model.getParameters())
......
......@@ -69,6 +69,7 @@ def main(args):
# Model training
print("Model training...")
print("Num workers: ", num_workers)
print("Number of epochs: ", num_epochs)
print("Number of epochs for parameter syncronization: ", num_epochs_for_param_sync)
......
......@@ -16,7 +16,7 @@ from shuffle import global_shuffle
def main(args):
print("E: ", platform.uname())
#print("E: ", platform.uname())
#eddl.download_mnist()
......@@ -65,16 +65,19 @@ def main(args):
y_train_dist = array(y_train, train_images_per_worker)
# Model training
print("Model training...")
print("MODEL TRAINING...")
print("Num workers: ", num_workers)
print("Number of epochs: ", num_epochs)
print("Number of epochs for parameter syncronization: ", num_epochs_for_param_sync)
for _ in range(0, 1):
for i in range(0, num_epochs):
print("Shuffling...")
#print("Shuffling...")
x_train_dist, y_train_dist = global_shuffle(x_train_dist, y_train_dist)
print()
print("Training epoch: ", i)
print()
compss_api.train_batch(
net,
x_train_dist,
......@@ -82,6 +85,7 @@ def main(args):
num_workers,
num_epochs_for_param_sync,
workers_batch_size)
end_time = timer()
final_time = end_time - start_time
......
......@@ -20,7 +20,7 @@ from utils import download_cifar100_npy
def main(args):
print("E: ", platform.uname())
#print("E: ", platform.uname())
#download_cifar100_npy()
......@@ -63,6 +63,7 @@ def main(args):
# Model training
print("Model training...")
print("Num workers: ", num_workers)
print("Number of epochs: ", num_epochs)
print("Number of epochs for parameter syncronization: ", num_epochs_for_param_sync)
......@@ -104,6 +105,7 @@ def main(args):
final_time = end_time - start_time
print("Elapsed time: ", final_time, " seconds")
if __name__ == "__main__":
......
......@@ -23,7 +23,7 @@ from utils import download_cifar100_npy
def main(args):
print("E: ", platform.uname())
#print("E: ", platform.uname())
#download_cifar100_npy()
......@@ -65,7 +65,8 @@ def main(args):
x_text = apply(eddlT.div_, x_test, 255.0)
# Model training
print("Model training...")
print("MODEL TRAINING...")
print("Num workers: ", num_workers)
print("Number of epochs: ", num_epochs)
print("Number of epochs for parameter syncronization: ", num_epochs_for_param_sync)
......@@ -73,14 +74,16 @@ def main(args):
for i in range(0, num_epochs):
epoch_start_time = timer()
print("Shuffling...")
#print("Shuffling...")
x_train, y_train = global_shuffle(x_train, y_train)
# start_epoch = num_epochs_for_param_sync * i + 1
# end_epoch = start_epoch + num_epochs_for_param_sync - 1
# print("Training epochs [", start_epoch, " - ", end_epoch, "] ...")
print()
print("Training epoch: ", i)
print()
compss_api.train_batch(
net,
x_train,
......@@ -119,6 +122,10 @@ def main(args):
print("Elapsed time: ", final_time, " seconds")
# Model evaluation
print("Evaluating model against test set")
eddl.evaluate(net, [x_test], [y_test])
if __name__ == "__main__":
......
......@@ -18,7 +18,7 @@ from net_utils import net_parametersToNumpy
def main(args):
print("E: ", platform.uname())
#print("E: ", platform.uname())
#eddl.download_cifar10()
......@@ -64,6 +64,7 @@ def main(args):
# Model training
print("Model training...")
print("Num workers: ", num_workers)
print("Number of epochs: ", num_epochs)
print("Number of epochs for parameter syncronization: ", num_epochs_for_param_sync)
......@@ -91,8 +92,8 @@ def main(args):
p = net_parametersToNumpy(eddl.get_parameters(net))
#print("Los final weights en main son estos: ", p)
print("Freq: ", getFreqFromParameters(p))
sys.stdout.flush()
#print("Freq: ", getFreqFromParameters(p))
#sys.stdout.flush()
#print("Param saving")
#individualParamsSave(p, "cifar10_lenet_async_ + str(max_num_async_epochs - 1) + ".txt")
......@@ -121,8 +122,8 @@ def main(args):
#eddl.save_net_to_onnx_file(net, output_file)
#individualParamsSave(p, "async" + str(max_num_async_epochs) + ".txt")
print("Evaluating model against train set")
eddl.evaluate(net, [x_train], [y_train])
#print("Evaluating model against train set")
#eddl.evaluate(net, [x_train], [y_train])
print("Evaluating model against test set")
eddl.evaluate(net, [x_test], [y_test])
......
......@@ -21,7 +21,7 @@ from shuffle import global_shuffle
def main(args):
print("E: ", platform.uname())
#print("E: ", platform.uname())
#eddl.download_cifar10()
......@@ -66,7 +66,8 @@ def main(args):
y_train_dist = array(y_train, train_images_per_worker)
# Model training
print("Model training...")
print("MODEL TRAINING...")
print("Num workers: ", num_workers)
print("Number of epochs: ", num_epochs)
print("Number of epochs for parameter syncronization: ", num_epochs_for_param_sync)
......@@ -75,14 +76,16 @@ def main(args):
epoch_start_time = timer()
print("Shuffling...")
#print("Shuffling...")
x_train_dist, y_train_dist = global_shuffle(x_train_dist, y_train_dist)
# start_epoch = num_epochs_for_param_sync * i + 1
# end_epoch = start_epoch + num_epochs_for_param_sync - 1
# print("Training epochs [", start_epoch, " - ", end_epoch, "] ...")
print()
print("Training epoch: ", i)
print()
compss_api.train_batch(
net,
x_train_dist,
......@@ -100,8 +103,8 @@ def main(args):
epoch_end_time = timer()
print("Elapsed time for epoch ", str(i), ": ", str(epoch_end_time - epoch_start_time), " seconds")
print("Freq: ", getFreqFromParameters(p))
sys.stdout.flush()
#print("Freq: ", getFreqFromParameters(p))
#sys.stdout.flush()
#print("Param saving")
#individualParamsSave(p, "cifar10_lenet_sync" + str(i) + ".txt")
......@@ -129,8 +132,8 @@ def main(args):
# eddl.save_net_to_onnx_file(net, output_file)
# individualParamsSave(p, "async" + str(max_num_async_epochs) + ".txt")
print("Evaluating model against train set")
eddl.evaluate(net, [x_train], [y_train])
#print("Evaluating model against train set")
#eddl.evaluate(net, [x_train], [y_train])
print("Evaluating model against test set")
eddl.evaluate(net, [x_test], [y_test])
......
......@@ -17,7 +17,7 @@ from shuffle import global_shuffle
def main(args):
print("E: ", platform.uname())
#print("E: ", platform.uname())
#eddl.download_cifar10()
......@@ -62,36 +62,39 @@ def main(args):
y_train_dist = array(y_train, train_images_per_worker)
# Model training
print("Model training...")
print("MODEL TRAINING...")
print("Num workers: ", num_workers)
print("Number of epochs: ", num_epochs)
print("Number of epochs for parameter syncronization: ", num_epochs_for_param_sync)
for _ in range(0, 1):
for i in range(0, num_epochs):
print("Shuffling...")
#print("Shuffling...")
x_train_dist, y_train_dist = global_shuffle(x_train_dist, y_train_dist)
#start_epoch = num_epochs_for_param_sync * i + 1
#end_epoch = start_epoch + num_epochs_for_param_sync - 1
#print("Training epochs [", start_epoch, " - ", end_epoch, "] ...")
print()
print("Training epoch: ", i)
print()
#compss_api.train_batch(net, x_train, y_train, num_workers, num_epochs_for_param_sync, workers_batch_size)
# Model evaluation
import sys
print("Antes de: ")
sys.stdout.flush()
#print("Antes de: ")
#sys.stdout.flush()
p = eddl.get_parameters(net)
print("despues ")
sys.stdout.flush()
#print("despues ")
#sys.stdout.flush()
#p = net_parametersToNumpy(p)
#print("Freq: ", getFreqFromParameters(p))
sys.stdout.flush()
#sys.stdout.flush()
#individualParamsSave(p, "sync" + str(num_epochs) + "-epoch" + str(i) + ".txt")
......@@ -115,8 +118,8 @@ def main(args):
#eddl.save_net_to_onnx_file(net, output_file)
#individualParamsSave(p, "async" + str(max_num_async_epochs) + ".txt")
print("Evaluating model against train set")
eddl.evaluate(net, [x_train], [y_train])
#print("Evaluating model against train set")
#eddl.evaluate(net, [x_train], [y_train])
print("Evaluating model against test set")
eddl.evaluate(net, [x_test], [y_test])
......
......@@ -19,7 +19,7 @@ from net_utils import net_parametersToNumpy
def main(args):
print("E: ", platform.uname())
#print("E: ", platform.uname())
#eddl.download_cifar10()
......@@ -64,6 +64,7 @@ def main(args):
# Model training
print("Model training...")
print("Num workers: ", num_workers)
print("Number of epochs: ", num_epochs)
print("Number of epochs for parameter syncronization: ", num_epochs_for_param_sync)
......@@ -84,11 +85,11 @@ def main(args):
# Model evaluation
p = net_parametersToNumpy(eddl.get_parameters(net))
print("Freq: ", getFreqFromParameters(p))
#print("Freq: ", getFreqFromParameters(p))
individualParamsSave(p, "async" + str(max_num_async_epochs) + ".txt")
print("Evaluating model against train set")
eddl.evaluate(net, [x_train], [y_train])
#print("Evaluating model against train set")
#eddl.evaluate(net, [x_train], [y_train])
print("Evaluating model against test set")
eddl.evaluate(net, [x_test], [y_test])
......
......@@ -17,7 +17,7 @@ from shuffle import global_shuffle
def main(args):
print("E: ", platform.uname())
#print("E: ", platform.uname())
#eddl.download_cifar10()
......@@ -61,21 +61,24 @@ def main(args):
eddlT.div_(x_test, 255.0)
# Model training
print("Model training...")
print("MODEL TRAINING...")
print("Num workers: ", num_workers)
print("Number of epochs: ", num_epochs)
print("Number of epochs for parameter syncronization: ", num_epochs_for_param_sync)
for _ in range(0, 1):
for i in range(0, num_epochs):
print("Shuffling...")
#print("Shuffling...")
x_train_dist, y_train_dist = global_shuffle(x_train_dist, y_train_dist)
#start_epoch = num_epochs_for_param_sync * i + 1
#end_epoch = start_epoch + num_epochs_for_param_sync - 1
#print("Training epochs [", start_epoch, " - ", end_epoch, "] ...")
print()
print("Training epoch: ", i)
print()
compss_api.train_batch(
net,
x_train_dist,
......@@ -87,16 +90,16 @@ def main(args):
# Model evaluation
import sys
print("Antes de: ")
sys.stdout.flush()
#print("Antes de: ")
#sys.stdout.flush()
p = eddl.get_parameters(net)
print("despues ")
sys.stdout.flush()
#print("despues ")
#sys.stdout.flush()
#p = net_parametersToNumpy(p)
#print("Freq: ", getFreqFromParameters(p))
sys.stdout.flush()
#sys.stdout.flush()
#individualParamsSave(p, "sync" + str(num_epochs) + "-epoch" + str(i) + ".txt")
......@@ -116,8 +119,8 @@ def main(args):
#print("Freq: ", getFreqFromParameters(p))
#individualParamsSave(p, "async" + str(max_num_async_epochs) + ".txt")
print("Evaluating model against train set")
eddl.evaluate(net, [x_train], [y_train])
#print("Evaluating model against train set")
#eddl.evaluate(net, [x_train], [y_train])
print("Evaluating model against test set")
eddl.evaluate(net, [x_test], [y_test])
......
......@@ -45,7 +45,7 @@ def main(args):
eddl.sgd(CVAR_SGD1, CVAR_SGD2),
["soft_cross_entropy"],
["categorical_accuracy"],
eddl.CS_CPU(),
eddl.CS_GPU(),
True
)
......@@ -69,6 +69,7 @@ def main(args):
# Model training
print("Model training...")
print("Num workers: ", num_workers)
print("Number of epochs: ", num_epochs)
print("Number of epochs for parameter syncronization: ", num_epochs_for_param_sync)
......
......@@ -16,7 +16,7 @@ from shuffle import global_shuffle