Commit 3b49c275 authored by flordan's avatar flordan
Browse files

Merge branch '82-add-timeout-to-agents-test' into 'trunk'

Resolve "Add timeout to agents test"

Closes #82

See merge request wdc/compss/framework!183
parents eba0bf65 f7e10b06
......@@ -37,6 +37,7 @@ NUM_CHECK_RETRIES="10"
agentStartCommand="${AGENTS_SCRIPTS_HOME}compss_agent_start"
callOperationCommand="${AGENTS_SCRIPTS_HOME}compss_agent_call_operation"
pidsAllAgents=""
......@@ -208,6 +209,9 @@ get_args() {
cei=*)
add_param_to_call_operation "--$OPTARG"
;;
exec_time=*)
exec_time=${OPTARG//exec_time=/}
;;
lang=*)
lang=${OPTARG//lang=/}
add_param_to_call_operation "--lang=${lang}"
......@@ -381,10 +385,21 @@ specific_agent_start_command="${agentStartCommand}
if [ -z "${pidAgent1}" ]; then
pidAgent1=${cmd_pid}
fi
pidsAllAgents="${pidsAllAgents} ${cmd_pid}"
}
###############################################
# Kill all agents
#
# Forcefully terminates every agent process recorded in pidsAllAgents,
# runs compss_clean_procs and then exits the calling (sub)shell with status 1.
# Globals:
#   pidsAllAgents (read) - whitespace-separated list of agent PIDs
###############################################
kill_agents() {
  display_error "Killing all agents"
  # Skip the kill when no agent PID has been recorded: a bare "kill -9" would only print a usage error.
  if [[ -n "${pidsAllAgents//[[:space:]]/}" ]]; then
    # Intentionally unquoted so the list is split into one argument per PID.
    # shellcheck disable=SC2086
    kill -9 ${pidsAllAgents}
  fi
  compss_clean_procs # not ideal but killing only the agents left a bunch of processes of the runtime/executors/bindings
  exit 1
}
###############################################
# Check Agent Local Resources
###############################################
......@@ -432,6 +447,7 @@ check_local_resources() {
fi
}
###############################################
# Check Agent started properly
###############################################
......@@ -567,10 +583,10 @@ check_topology() {
if [[ ${cpus_topology} -eq ${cpus_num_agents} ]]; then
if [ ${verbose_level} -gt 0 ]; then
display_success "${GREEN}Topology created successfully${NC}"
display_success "Topology created successfully"
fi
else
fatal_error "${RED}Error creating topology${NC}" 1
fatal_error "Error creating topology" 1
fi
}
......@@ -611,6 +627,21 @@ call_operation() {
add_param_to_call_operation "${executable} ${execution_params}"
print_and_run_cmd "${callOperationCommand}" "${output_log}" "${error_log}"
if [ ! -z ${exec_time} ]; then
if [ ${exec_time: -1} == "s" ]; then
exec_time=${exec_time::-1}
fi
if [ ${exec_time: -1} == "m" ]; then
exec_time=$(( 60*${exec_time::-1} ))
fi
if [ ${exec_time: -1} == "h" ]; then
exec_time=$(( 3600*${exec_time::-1} ))
fi
#timeout process for the agents ("timeout" command doesn't allow for a shell command such as "wait")
(sleep ${exec_time} && kill_agents) 1>/dev/null 2>/dev/null &
timeoutPID=$!
fi
# check execution start
retries=${NUM_CHECK_RETRIES}
while [ ! -f "${master_log}/jobs/job1_NEW.out" ]; do
......@@ -629,14 +660,23 @@ call_operation() {
if [ ${verbose_level} -gt 1 ]; then
display_info "waiting for agent1 with pid: $pidAgent1"
fi
wait $pidAgent1
wait ${pidsAllAgents} 1>/dev/null 2>/dev/null
kill -9 ${timeoutPID} 1>/dev/null 2>/dev/null
exit_kill_timeout=$?
wait ${timeoutPID} 1>/dev/null 2>/dev/null #this suppresses the output of the previous kill
# if the kill of the timeout process fails it means the timeout process ended and that the agents were killed by it
if [ "${exit_kill_timeout}" == "1" ]; then
fatal_error "At least one agent process has not yet finished its work after ${exec_time} seconds." 124
fi
if grep -q "Job completed after" "${master_log}.outputlog"; then
if [ ${verbose_level} -gt 0 ]; then
display_success "Execution ended succesfully"
fi
else
fatal_error "${RED}Execution failed${NC}" 1
fatal_error "Execution failed." 1
fi
}
......
......@@ -16,6 +16,7 @@ exit_value=0
NUM_RETRIES="50"
jar_name="0.2_1_agent_stop.jar"
app_name="0.2_1_agent_stop"
expectedTime="60s"
# Traps and Handlers
function kill_agents() {
......@@ -72,17 +73,21 @@ echo "Agent started" > >(tee -a "${output_log}")
# STOPPING AGENT WITH CURL AND CHECKING IF IT WORKED
curl -XDELETE http://127.0.0.1:46101/COMPSs 1>/dev/null 2>/dev/null
CONFIRMATION=""
while [ -z "${CONFIRMATION}" ]; do
CONFIRMATION=$(grep "Agent was shutdown" "${test1_log_dir}/COMPSsWorker01/agent.log")
# Poll the agent log for the shutdown confirmation, giving up after NUM_RETRIES one-second attempts.
retries="${NUM_RETRIES}"
while [ -z "${CONFIRMATION}" ] && [ "${retries}" -gt "0" ]; do
sleep 1
CONFIRMATION=$(grep "Agent was shutdown" "${test1_log_dir}/COMPSsWorker01/agent.log")
# Consume one attempt; without this the retry bound never triggers and the loop can spin forever.
retries=$((retries - 1))
done
if [ -z "${CONFIRMATION}" ]; then
echo "Agent failed to stop with curl" > >(tee -a "${error_log}")
exit 1
fi
echo "The agent properly shuts down with curl" > >(tee -a "${output_log}")
AGENT_PIDS=""
# STARTING AGENT TO STOP IT WITH A --stop OPERATION CALL
agent2_log_dir="${log_dir}/test2"
compss_agent_start_service \
......@@ -90,6 +95,7 @@ compss_agent_start_service \
--classpath="${base_app_dir}/${jar_name}" \
--log_dir=${agent2_log_dir} \
-d \
--exec_time=${expectedTime} \
${runcompss_opts} \
--project="${base_app_dir}/project.xml" \
--resources="${base_app_dir}/resources.xml" \
......@@ -97,11 +103,11 @@ compss_agent_start_service \
sleep 1
pid_info=""
CONFIRMATION=""
while [ -z "${CONFIRMATION}" ]; do
sleep 1
CONFIRMATION=$(grep "Agent was shutdown" "${agent2_log_dir}/COMPSsWorker01/agent.log")
done
CONFIRMATION=$(grep "Agent was shutdown" "${test1_log_dir}/COMPSsWorker01/agent.log")
if [ -z "${CONFIRMATION}" ]; then
echo "Agent failed to stop with --stop operation" > >(tee -a "${error_log}")
exit 1
fi
echo "The agent properly shuts down calling operation with flag --stop" > >(tee -a "${output_log}")
AGENT_PIDS=""
......@@ -114,6 +120,7 @@ compss_agent_start_service \
--classpath="${base_app_dir}/${jar_name}" \
--log_dir="${test3_log_dir}" \
-d \
--exec_time=${expectedTime} \
${runcompss_opts} \
--project="${base_app_dir}/project.xml" \
--resources="${base_app_dir}/resources.xml" \
......@@ -142,17 +149,18 @@ compss_agent_start_service \
--classpath="${base_app_dir}/${jar_name}" \
--log_dir=${test4_log_dir} \
-d \
--exec_time=${expectedTime} \
${runcompss_opts} \
--project="${base_app_dir}/project.xml" \
--resources="${base_app_dir}/resources.xml" \
es.bsc.compss.test.DemoClassError 1
sleep 1
CONFIRMATION=""
while [ -z "${CONFIRMATION}" ]; do
sleep 1
CONFIRMATION=$(grep "Agent was shutdown" "${test4_log_dir}/COMPSsWorker01/agent.log")
done
CONFIRMATION=$(grep "Agent was shutdown" "${test4_log_dir}/COMPSsWorker01/agent.log")
if [ -z "${CONFIRMATION}" ]; then
echo "Failed to stop agent with a failure on the execution of compss_agent_call_operation." > >(tee -a "${error_log}")
exit 1
fi
echo "The agent properly shuts when an error occurs on the execution" > >(tee -a "${output_log}")
AGENT_PIDS=""
......
......@@ -16,6 +16,7 @@
NUM_RETRIES="50"
jar_name="4.0_1_agent_nested.jar"
app_name="4.0_1_agent_nested"
expectedTime="60s"
# Traps and Handlers
function kill_agents() {
......@@ -88,17 +89,13 @@
--log_dir="${agent1_log_dir}" \
--reuse_resources_on_block="false" \
-d \
--exec_time=${expectedTime} \
${runcompss_opts} \
--project="${base_app_dir}/project.xml" \
--resources="${base_app_dir}/resources.xml" \
--cei="recursive.CountItf" \
"recursive.Count" 3
retries="3"
while [ ! -f "${agent1_log_dir}/COMPSsWorker01/jobs/job1_NEW.out" ] && [ "${retries}" -gt "0" ]; do
sleep 2s
retries=$((retries - 1 ))
done
if [ ! -f "${agent1_log_dir}/COMPSsWorker01/jobs/job1_NEW.out" ]; then
echo "Could not invoke recursive.Count's main method." > >(tee -a "${error_log}")
exit 1
......
......@@ -16,6 +16,7 @@
NUM_RETRIES="50"
jar_name="4.1_1_agent_nested_blocks.jar"
app_name="4.1_1_agent_nested_blocks"
expectedTime="60s"
# Traps and Handlers
function kill_agents() {
......@@ -87,6 +88,7 @@
--classpath="${base_app_dir}/${jar_name}" \
--log_dir="${agent1_log_dir}" \
-d \
--exec_time=${expectedTime} \
${runcompss_opts} \
--project="${base_app_dir}/project.xml" \
--resources="${base_app_dir}/resources.xml" \
......
......@@ -16,6 +16,7 @@
NUM_RETRIES="50"
jar_name="4.1_1_agent_nested_files.jar"
app_name="4.1_1_agent_nested_files"
expectedTime="60s"
# Traps and Handlers
function kill_agents() {
......@@ -92,6 +93,7 @@
--log_dir="${agent_log_dir}" \
--reuse_resources_on_block"=false" \
-d \
--exec_time=${expectedTime} \
${runcompss_opts} \
--project="${base_app_dir}/project.xml" \
--resources="${base_app_dir}/resources.xml" \
......
......@@ -16,6 +16,7 @@
NUM_RETRIES="50"
jar_name="5.0_4_agent_nested.jar"
app_name="5.0_4_agent_nested"
# Time limit passed to compss_agent_start_service as --exec_time=${expectedTime}.
expectedTime="60s"
# Alias kept so that any code reading the previous variable name still sees the same value.
expected_time="${expectedTime}"
# Traps and Handlers
function kill_agents() {
......@@ -81,6 +82,7 @@
--classpath="${base_app_dir}/${jar_name}" \
--log_dir=${log_dir} \
-d \
--exec_time=${expectedTime} \
${runcompss_opts} \
--reuse_resources_on_block=false \
--project="${base_app_dir}/project.xml" \
......@@ -88,7 +90,7 @@
--topology=chain \
--cei="recursive.CountItf" \
"recursive.Count" 3
run_pid=$!
agent1_log_dir="${log_dir}/COMPSsWorker01/"
agent2_log_dir="${log_dir}/COMPSsWorker02/"
......
......@@ -13,6 +13,7 @@
# Global variables
AGENT_PIDS=""
exit_value=0
expected_time="60"
NUM_RETRIES="50"
jar_name="5.1_3_agents_nested_files.jar"
app_name="5.1_3_agents_nested_files"
......@@ -20,7 +21,7 @@
# Traps and Handlers
function kill_agents() {
for pid in ${AGENT_PIDS}; do
kill -SIGINT ${pid}
kill -SIGINT ${pid} 2>/dev/null
done
}
trap kill_agents EXIT
......@@ -105,8 +106,6 @@
for agent_id in $(seq 1 3); do
agent_port=46${agent_id}01
out_log_name=agent${agent_id}_output_log
eval agent_output_log=${!out_log_name}
retries="${NUM_RETRIES}"
curl -XGET http://127.0.0.1:${agent_port}/COMPSs/test 1>/dev/null 2>/dev/null
......@@ -117,7 +116,11 @@
curl -XGET http://127.0.0.1:${agent_port}/COMPSs/test 1>/dev/null 2>/dev/null
ev=$?
done
done
sleep 1
for agent_id in $(seq 1 3); do
out_log_name=agent${agent_id}_output_log
eval agent_output_log=${!out_log_name}
RESULT=$(grep "test invoked" "${agent_output_log}")
if [ -z "${RESULT}" ]; then
......@@ -125,9 +128,8 @@
exit 1
fi
echo "Agent started" > >(tee -a "${output_log}")
sleep 2s
done
sleep 2
for agent_id in $(seq 1 2); do
agent_port=46${agent_id}01
......@@ -139,16 +141,19 @@
--master_node="127.0.0.1" \
--master_port="46101" \
--cei=files.MainItf \
--stop \
--forward_to="COMPSsWorker02:46201;COMPSsWorker03:46301" \
files.Main > >(tee -a "${output_log}") 2> >(tee -a "${error_log}")
ev=$?
sleep 5s
if [ "$ev" != "0" ]; then
echo "Could not invoke recursive.Count's main method." > >(tee -a "${error_log}")
exit $ev
fi
echo "recursive.Count's main function invoked" > >(tee -a "${output_log}")
sleep 5
retries="3"
while [ ! -f "${agent1_log_dir}/jobs/job1_NEW.out" ] && [ "${retries}" -gt "0" ]; do
sleep 2s
......@@ -169,7 +174,17 @@
fi
echo "recursive.Count's main method properly started" > >(tee -a "${output_log}")
sleep 20s
echo "Waiting for Agent processes (PIDs: ${AGENT_PIDS}) to end" > >(tee -a "${output_log}")
(sleep ${expected_time} && kill_agents) 1>/dev/null 2>/dev/null &
timeoutPID=$!
wait ${AGENT_PIDS} 1>/dev/null 2>/dev/null
kill -9 ${timeoutPID} 1>/dev/null 2>/dev/null
exit_kill_timeout=$?
wait ${timeoutPID} 1>/dev/null 2>/dev/null
if [ "${exit_kill_timeout}" == "1" ]; then
echo "At least one agent process has not yet finished its work after ${expected_time} seconds." > >(tee -a "${error_log}")
exit 124
fi
created_jobs=$(ls "${agent1_log_dir}jobs" | grep -c NEW.out)
if [ ! "${created_jobs}" == "1" ]; then
......
......@@ -16,6 +16,7 @@
NUM_RETRIES="50"
jar_name="5.1_4_agent_nested_inout_object.jar"
app_name="5.1_4_agent_nested_inout_object"
expectedTime="60s"
# Traps and Handlers
function kill_agents() {
......@@ -86,6 +87,7 @@
--log_dir="${log_dir}" \
--reuse_resources_on_block="false" \
-d \
--exec_time=${expectedTime} \
${runcompss_opts} \
--project="${base_app_dir}/project.xml" \
--resources="${base_app_dir}/resources.xml" \
......
......@@ -15,6 +15,7 @@
exit_value=0
NUM_RETRIES="50"
app_name="5.2_1_agent_blocks_python"
expectedTime="60s"
# Traps and Handlers
function kill_agents() {
......@@ -84,6 +85,7 @@
--pythonpath="${base_app_dir}/src" \
--log_dir="${log_dir}" \
-d \
--exec_time=${expectedTime} \
${runcompss_opts} \
--project="${base_app_dir}/project.xml" \
--resources="${base_app_dir}/resources.xml" \
......
......@@ -15,6 +15,7 @@
exit_value=0
NUM_RETRIES="50"
app_name="5.3_1_agents_nested_blocks_python"
expectedTime="60s"
# Traps and Handlers
function kill_agents() {
......@@ -84,6 +85,7 @@ compss_agent_start_service \
--rest_port="46101" \
--comm_port="46102" \
-d \
--exec_time=${expectedTime} \
${runcompss_opts} \
--project="${base_app_dir}/project.xml" \
--resources="${base_app_dir}/resources.xml" \
......
......@@ -15,6 +15,7 @@
exit_value=0
NUM_RETRIES="50"
app_name="5.4_1_agents_python_collections"
expectedTime="60s"
# Traps and Handlers
function kill_agents() {
......@@ -50,6 +51,7 @@ compss_agent_start_service \
--pythonpath="${base_app_dir}/src" \
--log_dir="${log_dir}" \
-d \
--exec_time=${expectedTime} \
${runcompss_opts} \
--project="${base_app_dir}/project.xml" \
--resources="${base_app_dir}/resources.xml" \
......
......@@ -15,6 +15,7 @@ AGENT_PIDS=""
exit_value=0
NUM_RETRIES="50"
app_name="5.4_1_collection_passing"
expected_time="60s"
# Traps and Handlers
function kill_agents() {
......@@ -193,8 +194,18 @@ echo "Calling operation on Agent 1" > >(tee -a "${output_log}")
"collection_passing" > >(tee -a "${output_log}") 2> >(tee -a "${error_log}")
ev=$?
echo "Waiting for Agent 1 to finish" > >(tee -a "${output_log}")
wait ${agent1_pid}
(sleep ${expected_time} && kill_agents) 1>/dev/null 2>/dev/null &
timeoutPID=$!
wait ${agent1_pid} 1>/dev/null 2>/dev/null
kill -9 ${timeoutPID} 1>/dev/null 2>/dev/null
exit_kill_timeout=$?
wait ${timeoutPID} 1>/dev/null 2>/dev/null
if [ "${exit_kill_timeout}" == "1" ]; then
echo "At least one agent process has not yet finished its work after ${expected_time} seconds." > >(tee -a "${error_log}")
exit 124
fi
echo "Call operation ended, agents shut down"
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment