Ali Farazdaghi
ELEC 873
MPI's original targets were clusters with single-core nodes
#include <mpi.h>
#include <omp.h>

int main(int argc, char **argv) {
    int world_rank, tl;
    int max_threads = omp_get_max_threads();
    MPI_Comm ep_comm[max_threads];
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &tl);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    #pragma omp parallel
    {
        int nt = omp_get_num_threads();
        int tn = omp_get_thread_num();
        int ep_rank;
        /* master creates one endpoint per thread (proposed MPI endpoints API) */
        #pragma omp master
        MPI_Comm_create_endpoints(
            MPI_COMM_WORLD, nt, MPI_INFO_NULL, ep_comm);
        #pragma omp barrier
        /* each thread attaches to its own endpoint and gets its own rank */
        MPI_Comm_rank(ep_comm[tn], &ep_rank);
        ... // divide up work based on 'ep_rank'
        MPI_Allreduce(..., ep_comm[tn]); // or MPI_Send/Recv on the endpoint
        MPI_Comm_free(&ep_comm[tn]);
    }
    MPI_Finalize();
}
More threads → lower message rate → use fewer threads!
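As a rough illustration of what sits behind that observation (a minimal sketch, not from the slides: MPI_THREAD_MULTIPLE support is assumed, and NUM_MSGS / MSG_SIZE are made-up parameters), a two-rank message-rate microbenchmark can be structured so that every thread on rank 0 streams small messages to a partner thread on rank 1:

#include <mpi.h>
#include <omp.h>
#include <stdio.h>

#define NUM_MSGS 100000   /* illustrative: messages per thread */
#define MSG_SIZE 8        /* illustrative: message size in bytes */

int main(int argc, char **argv) {
    int tl, rank;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &tl);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    double t0 = MPI_Wtime();
    #pragma omp parallel
    {
        char buf[MSG_SIZE] = {0};
        int tag = omp_get_thread_num();  /* pair threads by tag; both ranks
                                            must run the same thread count */
        for (int i = 0; i < NUM_MSGS; i++) {
            if (rank == 0)
                MPI_Send(buf, MSG_SIZE, MPI_CHAR, 1, tag, MPI_COMM_WORLD);
            else if (rank == 1)
                MPI_Recv(buf, MSG_SIZE, MPI_CHAR, 0, tag, MPI_COMM_WORLD,
                         MPI_STATUS_IGNORE);
        }
    }
    double elapsed = MPI_Wtime() - t0;

    if (rank == 0)
        printf("%d threads: %.0f msgs/s aggregate\n", omp_get_max_threads(),
               (double)NUM_MSGS * omp_get_max_threads() / elapsed);

    MPI_Finalize();
    return 0;
}

Rerunning this with increasing OMP_NUM_THREADS is the kind of experiment that exposes the falling aggregate message rate; the exact curve depends on the MPI library and the network.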
Two Queues on the Receiver
Posted receive queue: Recv() was called before the message arrived
Unexpected message queue: the message arrived before a matching Recv() was posted
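A conceptual sketch of how that two-queue matching works (this is not an actual MPI library implementation; the entry_t/queue_t types, the queue capacity, and the ANY wildcard are made up for illustration): an arriving message is matched against the posted-receive queue, and a newly posted receive is matched against the unexpected-message queue.

#include <string.h>

#define Q_CAP 128
#define ANY   (-1)    /* stands in for MPI_ANY_SOURCE / MPI_ANY_TAG */

typedef struct { int src, tag; char payload[64]; } entry_t;
typedef struct { entry_t items[Q_CAP]; int n; } queue_t;

/* A posted receive (possibly wildcarded) matches a concrete message on (source, tag). */
static int matches(const entry_t *recv, const entry_t *msg) {
    return (recv->src == ANY || recv->src == msg->src) &&
           (recv->tag == ANY || recv->tag == msg->tag);
}

/* Ordered removal preserves the FIFO matching order MPI requires. */
static void remove_at(queue_t *q, int i) {
    memmove(&q->items[i], &q->items[i + 1],
            (size_t)(q->n - i - 1) * sizeof(entry_t));
    q->n--;
}

/* Message arrives from the network: search the posted-receive queue;
   if no receive matches, park the message on the unexpected-message queue. */
void on_message_arrival(queue_t *prq, queue_t *umq, entry_t msg) {
    for (int i = 0; i < prq->n; i++) {
        if (matches(&prq->items[i], &msg)) {
            /* matching Recv() was already posted: deliver the payload directly */
            remove_at(prq, i);
            return;
        }
    }
    if (umq->n < Q_CAP) umq->items[umq->n++] = msg;  /* hold until a Recv() is posted */
}

/* Application posts a Recv(): search the unexpected-message queue;
   if no message is waiting, park the receive on the posted-receive queue. */
void on_recv_posted(queue_t *prq, queue_t *umq, entry_t recv) {
    for (int i = 0; i < umq->n; i++) {
        if (matches(&recv, &umq->items[i])) {
            /* message arrived earlier: copy its payload out of the UMQ now */
            remove_at(umq, i);
            return;
        }
    }
    if (prq->n < Q_CAP) prq->items[prq->n++] = recv;  /* wait for the message */
}

Because all threads of a process funnel through these shared queues, heavier threading tends to mean more contention on them, which is one reason the per-process message rate drops as thread counts grow.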
Instead of giving each thread its own communication capabilities (endpoints)...
give each thread ownership over a chunk (partition) of the message buffer
From: Implementation and Evaluation of MPI 4.0 Partitioned Communication Libraries
MPI_Psend_init(&buffer, partitions, count,
               datatype, dest, tag, comm, info, &request);
for (i = 0; i < num_iterations; i++) {
    MPI_Start(&request);
    /* Parallel loop with some number of threads */
    #pragma omp parallel for
    for (int partition = 0; partition < partitions; partition++) {
        /* Do work to fill this partition's portion of buffer */
        MPI_Pready(partition, request);  /* request passed by value;
                                            this partition may be sent immediately */
    }
    MPI_Wait(&request, MPI_STATUS_IGNORE);
}
From: Implementation and Evaluation of MPI 4.0 Partitioned Communication Libraries
MPI_Precv_init(&buffer, partitions, count,
               datatype, source, tag, comm, info, &request); // lazy initialization on the receiver side
for (i = 0; i < num_iterations; i++) {
    MPI_Start(&request);
    #pragma omp parallel for
    for (int partition = 0; partition < partitions; partition++) {
        /* do compute work */
        int arrived;                              /* thread-private arrival flag */
        MPI_Parrived(request, partition, &arrived);
        /* do work on early arrivals if available;
           if not, move on or poll MPI_Parrived() again */
    }
    MPI_Test(&request, &flag, MPI_STATUS_IGNORE); // completion of the whole buffer
}
From: Implementation and Evaluation of MPI 4.0 Partitioned Communication Libraries
Partitioned communication has lazy initialization on the receiver side
Refs are hyperlinked