Commit c8a5bf8b authored by Gaurav Saxena's avatar Gaurav Saxena
Browse files

BioFVM with X decomposition - obtaining much better timings than the Z...

BioFVM with X decomposition - obtaining much better timings than the Z decomposition. Also remember to use the ppr notation in the script file. Best results being obtained from 2 processes per node i.e. one per NUMA region and then nodes can be increased.
parent 975859a6
......@@ -778,7 +778,7 @@ void Microenvironment::simulate_bulk_sources_and_sinks( double dt )
void Microenvironment::simulate_cell_sources_and_sinks( std::vector<Basic_Agent*>& basic_agent_list , double dt )
{
//#pragma omp parallel for
#pragma omp parallel for
for( int i=0 ; i < basic_agent_list.size() ; i++ )
{
basic_agent_list[i]->simulate_secretion_and_uptake( this , dt );
......
......@@ -722,9 +722,15 @@ void Microenvironment::write_to_matlab( std::string filename, int rank, int size
return;
}
/*------------------------------Implementing the X-decomposition-------------------------------------*/
void diffusion_decay_solver__constant_coefficients_LOD_3D( Microenvironment& M, double dt, int size, int rank, int *coords, int *dims, MPI_Comm mpi_Cart_comm )
{
MPI_Request send_req, recv_req;
MPI_Request send_req, recv_req;
double t_strt_set, t_end_set;
double t_strt_x, t_end_x;
double t_strt_y,t_end_y;
double t_strt_z,t_end_z;
if( M.mesh.uniform_mesh == false || M.mesh.Cartesian_mesh == false )
......@@ -737,14 +743,17 @@ void diffusion_decay_solver__constant_coefficients_LOD_3D( Microenvironment& M,
if( !M.diffusion_solver_setup_done )
{
std::cout << std::endl << "Using method " << __FUNCTION__ << " (implicit 3-D LOD with Thomas Algorithm) ... "
<< std::endl << std::endl;
/*-------------------------------------------------------------*/
/* x_coordinates */
/* are of size local_x_nodes (see function resize() */
/* of class Cartesian Mesh in BioFVM_parallel.cpp. */
/* Each line of Voxels going */
/* from left to right forms a tridiagonal system of Equations */
//t_strt_set = MPI_Wtime();
//std::cout << std::endl << "Using method " << __FUNCTION__ << " (implicit 3-D LOD with Thomas Algorithm) ... "
//<< std::endl << std::endl;
/*-------------------------------------------------------------*/
/* x_coordinates are of size local_x_nodes */
/* (see function resize() of class Cartesian Mesh in */
/* BioFVM_parallel.cpp. */
/* Each line of Voxels going from left to right forms */
/* a tridiagonal system of Equations */
/* Now these lines are going to split in the X decomposition */
/*-------------------------------------------------------------*/
M.thomas_denomx.resize( M.mesh.x_coordinates.size() , M.zero ); //sizeof(x_coordinates) = local_x_nodes, denomx is the main diagonal elements
......@@ -763,21 +772,16 @@ void diffusion_decay_solver__constant_coefficients_LOD_3D( Microenvironment& M,
/* z_coordinates are of size of local_z_nodes. */
/* Each line of Voxels going */
/* from front to back forms a tridiagonal system of Equations */
/* Also in 1-D Z decomposition, these lines */
/* are going to be split over multiple processes. */
/*-------------------------------------------------------------*/
M.thomas_denomz.resize( M.mesh.z_coordinates.size() , M.zero );
M.thomas_cz.resize( M.mesh.z_coordinates.size() , M.zero );
/*-------------------------------------------------------------*/
/* For x, y direction, AND 1-D decomposition in the Z-dir */
/* the thomas_i_jump and thomas_j_jump will always be within */
/* the process but for thomas_k_jump, this will be within */
/* process for non-boundary voxels and in neighbouring */
/* processes for boundary voxels. */
/* Hence we can safely use thomas_i_jump, thomas_j_jump */
/* but NOT thomas_k_jump */
/* For X-decomposition thomas_i_jump - 1 can be in the previous*/
/* process and thomas_i_jump+1 can be in the next processs */
/* hence we can use thomas_j_jump and thomas_k_jump safely */
/* but we CANNOT use thomas_i_jump safely */
/*-------------------------------------------------------------*/
M.thomas_i_jump = 1;
......@@ -815,117 +819,32 @@ void diffusion_decay_solver__constant_coefficients_LOD_3D( Microenvironment& M,
// Thomas solver coefficients
/*--------------------------------------------------------------------*/
/* In 1-D Z decomposition, x and y-lines are contiguous adn typically */
/* the assignments below for x,y should not be changed */
/* Both the first voxel i.e. index 0 and last voxel i.e. index= */
/* x_coordinates.size()-1 are on the same process */
/* In 1-D X decomposition, y and z-lines are contiguous and typically */
/* the assignments below for y,z should not be changed */
/*--------------------------------------------------------------------*/
M.thomas_cx.assign( M.mesh.x_coordinates.size() , M.thomas_constant1a ); //Fill b and c elements with -D * dt/dx^2
M.thomas_denomx.assign( M.mesh.x_coordinates.size() , M.thomas_constant3 ); //Fill diagonal elements with (1 + 1/3 * lambda * dt + 2*D*dt/dx^2)
M.thomas_denomx[0] = M.thomas_constant3a; //First diagonal element is (1 + 1/3 * lambda * dt + 1*D*dt/dx^2)
M.thomas_denomx[ M.mesh.x_coordinates.size()-1 ] = M.thomas_constant3a; //Last diagonal element is (1 + 1/3 * lambda * dt + 1*D*dt/dx^2)
if( M.mesh.x_coordinates.size() == 1 ) //This is an extreme case, won't exist
{
M.thomas_denomx[0] = M.one;
M.thomas_denomx[0] += M.thomas_constant2;
}
M.thomas_cx[0] /= M.thomas_denomx[0]; //The first c element of tridiagonal matrix is div by first diagonal el.
if(rank == 0)
M.thomas_denomx[0] = M.thomas_constant3a; //First diagonal element is (1 + 1/3 * lambda * dt + 1*D*dt/dx^2)
if(rank == (size-1))
M.thomas_denomx[ M.mesh.x_coordinates.size()-1 ] = M.thomas_constant3a; //Last diagonal element is (1 + 1/3 * lambda * dt + 1*D*dt/dx^2)
for( int i=1 ; i <= M.mesh.x_coordinates.size()-1 ; i++ )
{
axpy( &M.thomas_denomx[i] , M.thomas_constant1 , M.thomas_cx[i-1] ); //axpy(1st, 2nd, 3rd) => 1st = 1st + 2nd * 3rd
M.thomas_cx[i] /= M.thomas_denomx[i]; //the value at size-1 is not actually used
} //Since value of size-1 is not used, it means it is the value after the last
//Diagonal element
/*--------------------------------------------------------------------*/
/* In 1-D Z decomposition, x and y-lines are contiguous adn typically */
/* the assignments below for x,y should not be changed */
/* Both the first voxel i.e. index 0 and last voxel i.e. index= */
/* y_coordinates.size()-1 are on the same process */
/*--------------------------------------------------------------------*/
if(rank == 0)
if( M.mesh.x_coordinates.size() == 1 ) //This is an extreme case, won't exist, still if it does
{ //then this must be at rank 0
M.thomas_denomx[0] = M.one;
M.thomas_denomx[0] += M.thomas_constant2;
}
if(rank == 0)
M.thomas_cx[0] /= M.thomas_denomx[0]; //The first c element of tridiagonal matrix is div by first diagonal el.
M.thomas_cy.assign( M.mesh.y_coordinates.size() , M.thomas_constant1a );
M.thomas_denomy.assign( M.mesh.y_coordinates.size() , M.thomas_constant3 );
M.thomas_denomy[0] = M.thomas_constant3a;
M.thomas_denomy[ M.mesh.y_coordinates.size()-1 ] = M.thomas_constant3a;
if( M.mesh.y_coordinates.size() == 1 )
{ M.thomas_denomy[0] = M.one; M.thomas_denomy[0] += M.thomas_constant2; }
M.thomas_cy[0] /= M.thomas_denomy[0];
for( int i=1 ; i <= M.mesh.y_coordinates.size()-1 ; i++ )
{
axpy( &M.thomas_denomy[i] , M.thomas_constant1 , M.thomas_cy[i-1] );
M.thomas_cy[i] /= M.thomas_denomy[i]; // the value at size-1 is not actually used
}
/*--------------------------------------------------------------------*/
/* for nProcs > 1 and 1-D decomposition in Z-direction, index 0 and */
/* index z_coordinates.size()-1, the latter being the ACTUAL boundary */
/* point of the domain, are on different processes */
/*--------------------------------------------------------------------*/
M.thomas_cz.assign( M.mesh.z_coordinates.size() , M.thomas_constant1a );
M.thomas_denomz.assign( M.mesh.z_coordinates.size() , M.thomas_constant3 );
/*--------------------------------------------------------------------*/
/* This is tricky: */
/* Now processes in Z direction will ALL have their individual */
/* thomas_denomz[] and thomas_cz[]. */
/* Only processes which contain the front face of the domain */
/* will set thomas_denomz[0]=thomas_constant3a */
/* These processes have mpi_Coord[2]=0 */
/* Also ONLY processes containing the back boundary of domain will */
/* set thomas_denomz[M.mesh.z_coordinates.size()-1]=thomas_constant3a */
/* Such processes will have mpi_Coord[2]=dims[2]-1 */
/* In 1-D Z decomposition, rank 0 sets thomas_denomz and only */
/* rank=size-1, sets z_coordinates.size()-1 */
/*--------------------------------------------------------------------*/
if(coords[2] == 0) //Or this can be if(rank == 0)
M.thomas_denomz[0] = M.thomas_constant3a;
if(coords[2] == dims[2]-1) //Or this can be if(rank == (size-1))
M.thomas_denomz[ M.mesh.z_coordinates.size()-1 ] = M.thomas_constant3a;
if( M.mesh.z_coordinates.size() == 1 ) //Extreme case - unlikely to occur
{
M.thomas_denomz[0] = M.one;
M.thomas_denomz[0] += M.thomas_constant2;
}
/*-------------------------------------------------------------------*/
/* This again is to be done on the processes containing the front */
/* boundary i.e. ONLY for mpi_Coord[2]=0; */
/*-------------------------------------------------------------------*/
if(coords[2] == 0) //Or this can be if(rank == 0)
M.thomas_cz[0] /= M.thomas_denomz[0];
/*-------------------------------------------------------------------*/
/* This will be serialized because thomas_cz[i-1] is needed which */
/* modified in every iteration of for loop. */
/* For rank 0 process in 1-D Z decomposition, calculation will */
/* begin at i=1 but for other processes, calculation will begin at */
/* i=0 AND we need to pass the value of last thomas_cz[i-1] in */
/* in every process to next process. */
/* I think this will just be a single double value. */
/*-------------------------------------------------------------------*/
/*-------------------------------------------------------------------*/
/* The logic is like this: */
/* (1) First calculate M.thomas_cz[0] on rank=0 (see above) */
/* (2) when ser_ctr=0, calculate 1 to z_coords.size()-1 for rank=0 */
/* (3) MPI_Isend(last thomas_cz[]) in rank 0 to rank 1 */
/* (4) In rank=1, MPI_Irecv() this value and calculate 0th element */
/* (5) ser_ctr increases and now calculate remaining values in rank=1*/
/* (6) Send last value to next process and continue like this */
/*-------------------------------------------------------------------*/
//axpy(1st, 2nd, 3rd) => 1st = 1st + 2nd * 3rd
//the value at size-1 is not actually used
//Since value of size-1 is not used, it means it is the value after the last Diagonal element
for(int ser_ctr=0; ser_ctr<=size-1; ser_ctr++)
{
if(rank == ser_ctr)
......@@ -933,24 +852,24 @@ void diffusion_decay_solver__constant_coefficients_LOD_3D( Microenvironment& M,
if(rank == 0 && rank <= size-1) //If size=1, then this process does not send data
{
for( int i=1 ; i <= M.mesh.z_coordinates.size()-1 ; i++ )
for( int i=1 ; i <= M.mesh.x_coordinates.size()-1 ; i++ )
{
axpy( &M.thomas_denomz[i] , M.thomas_constant1 , M.thomas_cz[i-1] );
M.thomas_cz[i] /= M.thomas_denomz[i]; // the value at size-1 is not actually used
axpy( &M.thomas_denomx[i] , M.thomas_constant1 , M.thomas_cx[i-1] );
M.thomas_cx[i] /= M.thomas_denomx[i]; // the value at size-1 is not actually used
}
}
else
{
for( int i=1 ; i <= M.mesh.z_coordinates.size()-1 ; i++ )
for( int i=1 ; i <= M.mesh.x_coordinates.size()-1 ; i++ )
{
axpy( &M.thomas_denomz[i] , M.thomas_constant1 , M.thomas_cz[i-1] );
M.thomas_cz[i] /= M.thomas_denomz[i]; // the value at size-1 is not actually used
axpy( &M.thomas_denomx[i] , M.thomas_constant1 , M.thomas_cx[i-1] );
M.thomas_cx[i] /= M.thomas_denomx[i]; // the value at size-1 is not actually used
}
}
if(rank < (size-1))
{
MPI_Isend(&(M.thomas_cz[M.mesh.z_coordinates.size()-1][0]), M.thomas_cz[M.mesh.z_coordinates.size()-1].size(), MPI_DOUBLE, ser_ctr+1, 1111,mpi_Cart_comm, &send_req);
MPI_Isend(&(M.thomas_cx[M.mesh.x_coordinates.size()-1][0]), M.thomas_cx[M.mesh.x_coordinates.size()-1].size(), MPI_DOUBLE, ser_ctr+1, 1111,mpi_Cart_comm, &send_req);
MPI_Wait(&send_req,MPI_STATUS_IGNORE);
}
}
......@@ -958,251 +877,155 @@ void diffusion_decay_solver__constant_coefficients_LOD_3D( Microenvironment& M,
if(rank == (ser_ctr+1) && (ser_ctr+1) <= (size-1))
{
std::vector<double> temp_cz(M.thomas_cz[0].size());
std::vector<double> temp_cx(M.thomas_cx[0].size());
MPI_Irecv(&temp_cz[0], temp_cz.size(), MPI_DOUBLE, ser_ctr, 1111, mpi_Cart_comm, &recv_req);
MPI_Irecv(&temp_cx[0], temp_cx.size(), MPI_DOUBLE, ser_ctr, 1111, mpi_Cart_comm, &recv_req);
MPI_Wait(&recv_req, MPI_STATUS_IGNORE);
axpy( &M.thomas_denomz[0] , M.thomas_constant1 , temp_cz ); //CHECK IF &temp_cz[0] is OK, axpy() in BioFVM_vector.cpp
M.thomas_cz[0] /= M.thomas_denomz[0]; // the value at size-1 is not actually used
axpy( &M.thomas_denomx[0] , M.thomas_constant1 , temp_cx ); //CHECK IF &temp_cz[0] is OK, axpy() in BioFVM_vector.cpp
M.thomas_cx[0] /= M.thomas_denomx[0]; // the value at size-1 is not actually used
}
MPI_Barrier(mpi_Cart_comm);
}
/*--------------------------------------------------------------------*/
/* In 1-D X decomposition, z and y-lines are contiguous adn typically */
/* the assignments below for z,y should not be changed */
/* Both the first voxel i.e. index 0 and last voxel i.e. index= */
/* y_coordinates.size()-1 are on the same process */
/*--------------------------------------------------------------------*/
M.thomas_cy.assign( M.mesh.y_coordinates.size() , M.thomas_constant1a );
M.thomas_denomy.assign( M.mesh.y_coordinates.size() , M.thomas_constant3 );
M.thomas_denomy[0] = M.thomas_constant3a;
M.thomas_denomy[ M.mesh.y_coordinates.size()-1 ] = M.thomas_constant3a;
if( M.mesh.y_coordinates.size() == 1 )
{
M.thomas_denomy[0] = M.one;
M.thomas_denomy[0] += M.thomas_constant2;
}
M.thomas_cy[0] /= M.thomas_denomy[0];
for( int i=1 ; i <= M.mesh.y_coordinates.size()-1 ; i++ )
{
axpy( &M.thomas_denomy[i] , M.thomas_constant1 , M.thomas_cy[i-1] );
M.thomas_cy[i] /= M.thomas_denomy[i]; // the value at size-1 is not actually used
}
M.thomas_cz.assign( M.mesh.z_coordinates.size() , M.thomas_constant1a );
M.thomas_denomz.assign( M.mesh.z_coordinates.size() , M.thomas_constant3 );
M.thomas_denomz[0] = M.thomas_constant3a;
M.thomas_denomz[ M.mesh.z_coordinates.size()-1 ] = M.thomas_constant3a;
if( M.mesh.z_coordinates.size() == 1 )
{
M.thomas_denomz[0] = M.one;
M.thomas_denomz[0] += M.thomas_constant2;
}
M.thomas_cz[0] /= M.thomas_denomz[0];
for( int i=1 ; i <= M.mesh.z_coordinates.size()-1 ; i++ )
{
axpy( &M.thomas_denomz[i] , M.thomas_constant1 , M.thomas_cz[i-1] );
M.thomas_cz[i] /= M.thomas_denomz[i]; // the value at size-1 is not actually used
}
M.diffusion_solver_setup_done = true;
//t_end_set = MPI_Wtime();
//std::cout<<"Set-up time = "<<(t_end_set-t_strt_set)<<std::endl;
}
// x-diffusion
M.apply_dirichlet_conditions();
/*-----------------------------------------------------------------------*/
/* For X/Y directions, I don't think any change is needed */
/*-----------------------------------------------------------------------*/
#pragma omp parallel for
for( int k=0; k < M.mesh.z_coordinates.size() ; k++ )
{
for( int j=0; j < M.mesh.y_coordinates.size() ; j++ )
{
// Thomas solver, x-direction
// remaining part of forward elimination, using pre-computed quantities
int n = M.voxel_index(0,j,k);
(*M.p_density_vectors)[n] /= M.thomas_denomx[0];
for( int i=1; i < M.mesh.x_coordinates.size() ; i++ )
{
n = M.voxel_index(i,j,k);
axpy( &(*M.p_density_vectors)[n] , M.thomas_constant1 , (*M.p_density_vectors)[n-M.thomas_i_jump] );
(*M.p_density_vectors)[n] /= M.thomas_denomx[i];
}
for( int i = M.mesh.x_coordinates.size()-2 ; i >= 0 ; i-- )
{
n = M.voxel_index(i,j,k);
naxpy( &(*M.p_density_vectors)[n] , M.thomas_cx[i] , (*M.p_density_vectors)[n+M.thomas_i_jump] );
}
}
}
// y-diffusion
M.apply_dirichlet_conditions();
#pragma omp parallel for
for( int k=0; k < M.mesh.z_coordinates.size() ; k++ )
{
for( int i=0; i < M.mesh.x_coordinates.size() ; i++ )
{
// Thomas solver, y-direction
// remaining part of forward elimination, using pre-computed quantities
int n = M.voxel_index(i,0,k);
(*M.p_density_vectors)[n] /= M.thomas_denomy[0];
for( int j=1; j < M.mesh.y_coordinates.size() ; j++ )
{
n = M.voxel_index(i,j,k);
axpy( &(*M.p_density_vectors)[n] , M.thomas_constant1 , (*M.p_density_vectors)[n-M.thomas_j_jump] );
(*M.p_density_vectors)[n] /= M.thomas_denomy[j];
}
// back substitution
// n = voxel_index( mesh.x_coordinates.size()-2 ,j,k);
for( int j = M.mesh.y_coordinates.size()-2 ; j >= 0 ; j-- )
{
n = M.voxel_index(i,j,k);
naxpy( &(*M.p_density_vectors)[n] , M.thomas_cy[j] , (*M.p_density_vectors)[n+M.thomas_j_jump] );
}
}
}
// z-diffusion
/*--------------------------------------------------------------------------------*/
/* This will change. Why is the Dirichlet condition being applied again and again?*/
/* This is where serialization will begin, I think after the apply_dirichlet() */
/* */
/*--------------------------------------------------------------------------------*/
M.apply_dirichlet_conditions();
/*------------------------------------------------------------------------------*/
/* PROCESSING OF 0TH ELEMENT IN ARRAY */
/* The processing on the 0th element of array will be done */
/* only on rank = 0, or basically the processes containing */
/* the front boundary of the domain. */
/* MPI divides domain into thick slices in Z-direction */
/* OpenMP divides the thick slices in vertical direction i.e. horizontal lines. */
/*------------------------------------------------------------------------------*/
/*------------------------------------------------------------------------------*/
/* FORWARD SUBSITUTION AND DATA SENDING OF BACK PLANE */
/* Let all the processing finish at rank 0, now we need to send the 'back' */
/* square of p_density_vectors[] to rank 1. Do the same at rank 2 so on... */
/* Possibly the thomas_k_jump will also change */
/* Instead of using thomas_k_jump, we can use prev=voxel_index(i,j,k-1) */
/* and then use it in (*M.p_density_vectors)[prev] */
/* Also for rank > 0, it needs to start at k=0 */
/* The values of p_density_vectors sent from the previous neighbour will be */
/* stored in temp_p_density_vectors -- only used for k=1 else just use */
/* p_density_vectors. Vector of vectors is not stored contiguously */
/* So we need to pack them into a single array and then send it to other end */
/*------------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------------*/
/* BACK SUBSTITUTION AND DATA SENDING OF FRONT PLANE */
/* This starts from the last process i.e. rank = size-1 then proceeds backwards */
/* i.e. from back boundary to front boundary. On the last process k starts from */
/* z_coordinates.size()-2 but on other processes, it starts from */
/* z_coordinates.size()-1. */
/*---------------------------------------------------------------------------------*/
/*------------------COMMENT OUT FROM HERE-----------------------------------------*/
/* So the code is like M.apply_dirichlet_conditions() then code for Z-diffusion */
/* Then M.apply_dirichlet_conditions(); */
/*--------------------------------------------------------------------------------*/
// #pragma omp parallel for
// for( int j=0; j < M.mesh.y_coordinates.size() ; j++ )
// {
//
// for( int i=0; i < M.mesh.x_coordinates.size() ; i++ )
// {
//
// // remaining part of forward elimination, using pre-computed quantities
// int n = M.voxel_index(i,j,0);
// (*M.p_density_vectors)[n] /= M.thomas_denomz[0];
//
// // should be an empty loop if mesh.z_coordinates.size() < 2
// for( int k=1; k < M.mesh.z_coordinates.size() ; k++ )
// {
// n = M.voxel_index(i,j,k);
// axpy( &(*M.p_density_vectors)[n] , M.thomas_constant1 , (*M.p_density_vectors)[n-M.thomas_k_jump] );
// (*M.p_density_vectors)[n] /= M.thomas_denomz[k];
// }
//
// // for parallelization need to break forward elimination and back substitution into
// // should be an empty loop if mesh.z_coordinates.size() < 2
//
// for( int k = M.mesh.z_coordinates.size()-2 ; k >= 0 ; k-- )
// {
// n = M.voxel_index(i,j,k);
// naxpy( &(*M.p_density_vectors)[n] , M.thomas_cz[k] , (*M.p_density_vectors)[n+M.thomas_k_jump] );
// // n -= i_jump;
// }
// }
// }
/*----------------------------TILL HERE----------------------------------------------*/
/*-----------------------------------------------------------------------------------*/
/* CODE FOR FORWARD ELIMINATION */
/* FORWARD ELIMINATION - x DIRECTION/DECOMPOSITION */
/*-----------------------------------------------------------------------------------*/
int n1;
int x_size = M.mesh.x_coordinates.size();
/* For data packing... */
/* My direction of traversing is go up up up i.e. y direction points then go in i.e. Z-direction */
/* Remember to visualize 3-D as 2-D plates kept after one another. Hence Z-direction data is farther */
/* apart than X/Y direction */
int y_size = M.mesh.y_coordinates.size();
int z_size = M.mesh.z_coordinates.size();
int p_size = (*M.p_density_vectors)[0].size(); //All p_density_vectors elements have same size, use anyone
int snd_data_size = x_size * y_size * p_size;
int rcv_data_size = x_size * y_size * p_size; //All p_density_vectors elements have same size, use anyone
int snd_data_size = z_size * y_size * p_size; //Number of data elements to be sent
int rcv_data_size = z_size * y_size * p_size; //All p_density_vectors elements have same size, use anyone
std::vector<double> snd_data(snd_data_size);
std::vector<double> rcv_data(rcv_data_size);
std::vector< std::vector < std::vector<double> > > block3d ( y_size, std::vector<std::vector<double>> ( x_size, std::vector<double>(p_size) ) );
/*----------------------------------------------------------------------------------------------------------*/
/*Declaring a matrix with y_size rows, x_size columns and each element of matrix is a vector of size p_size.*/
/*First will store received data from rcv_data into this 3-d block because we need to pass base address */
/*of vectors in the routine axpy() */
/*----------------------------------------------------------------------------------------------------------*/
/* So row is along Z axis, column of each row is along Y-axis and each element has p_density_vector*/
for(int ser_ctr=0; ser_ctr <= size-1; ser_ctr++)
std::vector< std::vector < std::vector<double> > > block3d ( z_size, std::vector<std::vector<double> > ( y_size, std::vector<double>(p_size) ) );
//t_strt_x = MPI_Wtime();
for(int ser_ctr=0; ser_ctr <= size-1; ser_ctr++)
{
if(rank == ser_ctr)
{
if(rank == 0)
{
#pragma omp parallel for
for(int j=0; j<= M.mesh.y_coordinates.size()-1; j++)
for(int k=0; k<= M.mesh.z_coordinates.size()-1; k++)
{
for(int i=0; i<= M.mesh.x_coordinates.size()-1; i++)
for(int j=0; j<= M.mesh.y_coordinates.size()-1; j++)
{
int n = M.voxel_index(i,j,0);
(*M.p_density_vectors)[n] /= M.thomas_denomz[0];
int n = M.voxel_index(0,j,k);
(*M.p_density_vectors)[n] /= M.thomas_denomx[0];
for( int k=1; k < M.mesh.z_coordinates.size() ; k++ )
for( int i=1; i < M.mesh.x_coordinates.size() ; i++ )
{
n = M.voxel_index(i,j,k);
n1 = M.voxel_index(i,j,k-1);
int n = M.voxel_index(i,j,k);
int n1 = M.voxel_index(i-1,j,k); //Can remove this overhead of finding index now
axpy( &(*M.p_density_vectors)[n] , M.thomas_constant1 , (*M.p_density_vectors)[n1] );
(*M.p_density_vectors)[n] /= M.thomas_denomz[k];
(*M.p_density_vectors)[n] /= M.thomas_denomx[i];
}
}
}
}
else
{
#pragma omp parallel for
for(int j=0; j<= M.mesh.y_coordinates.size()-1; j++)
#pragma omp parallel for
for(int k=0; k<= M.mesh.z_coordinates.size()-1; k++)
{
for(int i=0; i<= M.mesh.x_coordinates.size()-1; i++)
for(int j=0; j<= M.mesh.y_coordinates.size()-1; j++)
{
int n = M.voxel_index(i,j,0); //Need to consider case separately for k=0, as k-1 would be -1 !
axpy( &(*M.p_density_vectors)[n] , M.thomas_constant1 , block3d[j][i]);
(*M.p_density_vectors)[n] /= M.thomas_denomz[0];
int n = M.voxel_index(0,j,k); //Need to consider case separately for k=0, as k-1 would be -1 !
axpy( &(*M.p_density_vectors)[n] , M.thomas_constant1 , block3d[k][j]);
(*M.p_density_vectors)[n] /= M.thomas_denomx[0];
for( int k=1; k < M.mesh.z_coordinates.size() ; k++ )
for( int i=1; i < M.mesh.x_coordinates.size() ; i++ )
{
n = M.voxel_index(i,j,k);
n1 = M.voxel_index(i,j,k-1);
int n = M.voxel_index(i,j,k);
int n1 = M.voxel_index(i-1,j,k);
axpy( &(*M.p_density_vectors)[n] , M.thomas_constant1 , (*M.p_density_vectors)[n1] );
(*M.p_density_vectors)[n] /= M.thomas_denomz[k];
(*M.p_density_vectors)[n] /= M.thomas_denomx[i];
}
}
}
}
if(rank < (size-1))
{
int z_end = M.mesh.z_coordinates.size()-1;
int x_end = M.mesh.x_coordinates.size()-1;
int ctr=0;
for(int j=0; j<= M.mesh.y_coordinates.size()-1; j++)
for(int k=0; k<= M.mesh.z_coordinates.size()-1; k++)
{
for(int i=0; i<= M.mesh.x_coordinates.size()-1; i++)
for(int j=0; j<= M.mesh.y_coordinates.size()-1; j++)
{