//@HEADER
// ************************************************************************
//
// HPCCG: Simple Conjugate Gradient Benchmark Code
// Copyright (2006) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// This library is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as
// published by the Free Software Foundation; either version 2.1 of the
// License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
//@HEADER
#ifdef USING_MPI // Compile this routine only if running in parallel
#include <iostream>
using std::cout;
using std::cerr;
using std::endl;
#include <map>
#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include <cassert>
#include "HPC_Sparse_Matrix.hpp"
#include "read_HPC_row.hpp"
#include "make_local_matrix.hpp"
#include "mytimer.hpp"
//#define DEBUG
void make_local_matrix(HPC_Sparse_Matrix * A)
{
std::map< int, int > externals;
int i, j, k;
int num_external = 0;
double t0;
int debug_details = 0; // Set to 1 for voluminous output
#ifdef DEBUG
int debug = 1;
#else
int debug = 0;
#endif
// Get MPI process info
int size, rank; // Number of MPI processes, My process ID
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
// Extract Matrix pieces
int start_row = A->start_row;
int stop_row = A->stop_row;
int total_nrow = A->total_nrow;
long long total_nnz = A->total_nnz;
int local_nrow = A->local_nrow;
int local_nnz = A->local_nnz;
int * nnz_in_row = A->nnz_in_row;
double ** ptr_to_vals_in_row = A->ptr_to_vals_in_row;
int ** ptr_to_inds_in_row = A->ptr_to_inds_in_row;
// We need to convert the index values for the rows on this processor
// to a local index space. We need to:
// - Determine if each index refers to a local value or an external value
// - If local, subtract start_row from index value to get local index
// - If external, find out if it is already accounted for.
// - If so, then do nothing,
// - otherwise
// - add it to the list of external indices,
// - find out which processor owns the value.
// - Set up communication for sparse MV operation.
///////////////////////////////////////////
// Scan the indices and transform to local
///////////////////////////////////////////
if (debug) t0 = mytimer();
int *external_index = new int[max_external];
int *external_local_index = new int[max_external];
A->external_index = external_index;
A->external_local_index = external_local_index;
for (i=0; i< local_nrow; i++)
{
for (j=0; j<nnz_in_row[i]; j++)
{
int cur_ind = ptr_to_inds_in_row[i][j];
if (debug_details)
cout << "Process "<<rank<<" of "<<size<<" getting index "
<<cur_ind<<" in local row "<<i<<endl;
if (start_row <= cur_ind && cur_ind <= stop_row)
{
ptr_to_inds_in_row[i][j] -= start_row;
}
else // Must find out if we have already set up this point
{
if (externals.find(cur_ind)==externals.end())
{
externals[cur_ind] = num_external++;
if (num_external<=max_external)
{
external_index[num_external-1] = cur_ind;
// Mark index as external by adding one and negating it
ptr_to_inds_in_row[i][j] = - (ptr_to_inds_in_row[i][j] + 1);
}
else
{
cerr << "Must increase max_external in HPC_Sparse_Matrix.hpp"
<<endl;
abort();
}
}
else
{
// Mark index as external by adding 1 and negating it
ptr_to_inds_in_row[i][j] = - (ptr_to_inds_in_row[i][j] + 1);
}
}
}
}
if (debug) {
t0 = mytimer() - t0;
cout << " Time in transform to local phase = " << t0 << endl;
cout << "Processor " << rank << " of " << size <<
": Number of external equations = " << num_external << endl;
}
////////////////////////////////////////////////////////////////////////////
// Go through list of externals to find out which processors must be accessed.
////////////////////////////////////////////////////////////////////////////
if (debug) t0 = mytimer();
A->num_external = num_external;
int * tmp_buffer = new int[size]; // Temp buffer space needed below
// Build list of global index offsets
int * global_index_offsets = new int[size];
for (i=0;i<size; i++) tmp_buffer[i] = 0; // First zero out
tmp_buffer[rank] = start_row; // This is my start row
// This call sends the start_row of each ith processor to the ith
// entry of global_index_offsets on all processors.
// Thus, each processor knows the range of indices owned by all
// other processors.
// Note: There might be a better algorithm for doing this, but this
// will work...
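// Illustrative example (hypothetical numbers): with three ranks owning rows
// 0-99, 100-199 and 200-299, each rank contributes its start_row in its own
// slot and zero elsewhere, so the summed result on every rank is
// global_index_offsets = {0, 100, 200}.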
MPI_Allreduce(tmp_buffer, global_index_offsets, size, MPI_INT,
MPI_SUM, MPI_COMM_WORLD);
// Go through list of externals and find the processor that owns each
int * external_processor = new int[num_external];
int * new_external_processor = new int[num_external];
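// The owner of a global index is the rank with the largest start_row
// (i.e., global_index_offsets entry) not exceeding the index, so scan the
// offsets from the highest rank downward and stop at the first match.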
for (i=0; i< num_external; i++)
{
int cur_ind = external_index[i];
for (int j=size-1; j>=0; j--)
if (global_index_offsets[j] <= cur_ind)
{
external_processor[i] = j;
break;
}
}
if (debug) {
t0 = mytimer() - t0;
cout << " Time in finding processors phase = " << t0 << endl;
}
////////////////////////////////////////////////////////////////////////////
// Sift through the external elements. For each newly encountered external
// point assign it the next index in the sequence. Then look for other
// external elements that are updated by the same node and assign them the
// next set of index numbers in the sequence (i.e., elements updated by the
// same node have consecutive indices).
////////////////////////////////////////////////////////////////////////////
if (debug) t0 = mytimer();
int count = local_nrow;
for (i = 0; i < num_external; i++) external_local_index[i] = -1;
for (i = 0; i < num_external; i++) {
if (external_local_index[i] == -1) {
external_local_index[i] = count++;
for (j = i + 1; j < num_external; j++) {
if (external_processor[j] == external_processor[i])
external_local_index[j] = count++;
}
}
}
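// At this point every external has a local index in the range
// local_nrow .. local_nrow+num_external-1, and externals owned by the same
// processor occupy a single contiguous block of those indices.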
if (debug) {
t0 = mytimer() - t0;
cout << " Time in scanning external indices phase = " << t0 << endl;
}
if (debug) t0 = mytimer();
for (i=0; i< local_nrow; i++)
{
for (j=0; j<nnz_in_row[i]; j++)
{
if (ptr_to_inds_in_row[i][j]<0) // Change index values of externals
{
int cur_ind = - ptr_to_inds_in_row[i][j] - 1;
ptr_to_inds_in_row[i][j] = external_local_index[externals[cur_ind]];
}
}
}
for (i = 0 ; i < num_external; i++) new_external_processor[i] = 0;
for (i = 0; i < num_external; i++)
new_external_processor[external_local_index[i] - local_nrow] =
external_processor[i];
if (debug) {
t0 = mytimer() - t0;
cout << " Time in assigning external indices phase = " << t0 << endl;
}
if (debug_details)
{
for (i = 0; i < num_external; i++)
{
cout << "Processor " << rank << " of " << size <<
": external processor["<< i<< "] = " << external_processor[i]<< endl;
cout << "Processor " << rank << " of " << size <<
": new external processor["<< i<< "] = "
<< new_external_processor[i]<< endl;
}
}
////////////////////////////////////////////////////////////////////////////
///
// Count the number of neighbors from which we receive information to update
// our external elements. Additionally, fill the array tmp_neighbors in the
// following way:
// tmp_neighbors[i] = 0 ==> No external elements are updated by
// processor i.
// tmp_neighbors[i] = x ==> (x-1)/size elements are updated from
// processor i.
///
////////////////////////////////////////////////////////////////////////////
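// Illustrative example (hypothetical numbers): with size = 8, if ranks 2
// and 5 need 3 and 1 of our elements respectively, they contribute
// 1 + 3*8 = 25 and 1 + 1*8 = 9 to our slot, so after the Allreduce below
// tmp_buffer[rank] = 34, giving 34 % 8 = 2 send neighbors and
// (34 - 2) / 8 = 4 elements to send.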
t0 = mytimer();
int * tmp_neighbors = new int[size];
for (i = 0 ; i < size ; i++) tmp_neighbors[i] = 0;
int num_recv_neighbors = 0;
int length = 1;
for (i = 0; i < num_external; i++)
{
if (tmp_neighbors[new_external_processor[i]] == 0)
{
num_recv_neighbors++;
tmp_neighbors[new_external_processor[i]] = 1;
}
tmp_neighbors[new_external_processor[i]] += size;
}
/// sum over all processors all the tmp_neighbors arrays ///
MPI_Allreduce(tmp_neighbors, tmp_buffer, size, MPI_INT, MPI_SUM,
MPI_COMM_WORLD);
/// decode the combined 'tmp_neighbors' (stored in tmp_buffer)
// array from all the processors
int num_send_neighbors = tmp_buffer[rank] % size;
/// decode 'tmp_buffer[rank]' to deduce the total number of elements
// we must send
int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;
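// Each rank that needs data from this process contributed exactly one to
// our slot, plus 'size' for every element it needs. Since the number of
// such ranks is at most size-1, the remainder mod 'size' recovers the
// neighbor count and the quotient recovers the element count.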
//
// Check to see if we have enough workspace allocated. This could be
// dynamically modified, but let's keep it simple for now...
//
if (num_send_neighbors > max_num_messages)
{
cerr << "Must increase max_num_messages in HPC_Sparse_Matrix.hpp"
<<endl;
cerr << "Must be at least " << num_send_neighbors <<endl;
abort();
}
if (total_to_be_sent > max_external )
{
cerr << "Must increase max_external in HPC_Sparse_Matrix.hpp"
<<endl;
cerr << "Must be at least " << total_to_be_sent <<endl;
abort();
}
delete [] tmp_neighbors;
if (debug) {
t0 = mytimer() - t0;
cout << " Time in finding neighbors phase = " << t0 << endl;
}
if (debug) cout << "Processor " << rank << " of " << size <<
": Number of send neighbors = " << num_send_neighbors << endl;
if (debug) cout << "Processor " << rank << " of " << size <<
": Number of receive neighbors = " << num_recv_neighbors << endl;
if (debug) cout << "Processor " << rank << " of " << size <<
": Total number of elements to send = " << total_to_be_sent << endl;
if (debug) MPI_Barrier(MPI_COMM_WORLD);
/////////////////////////////////////////////////////////////////////////
///
// Make a list of the neighbors that will send information to update our
// external elements (in the order that we will receive this information).
///
/////////////////////////////////////////////////////////////////////////
int * recv_list = new int[max_external];
j = 0;
if (num_external > 0) recv_list[j++] = new_external_processor[0]; // Guard: a rank with no externals has no receive neighbors
for (i = 1; i < num_external; i++) {
if (new_external_processor[i - 1] != new_external_processor[i]) {
recv_list[j++] = new_external_processor[i];
}
}
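// Because the externals were grouped contiguously by owning processor above,
// detecting a change in new_external_processor lists each receive neighbor
// exactly once, in the order in which we will receive its data.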
//
// Send a dummy one-integer message to each of our recv neighbors
// (the payload is ignored; only the sender's rank matters)
//
int * send_list = new int[num_send_neighbors];
for (i = 0 ; i < num_send_neighbors; i++ ) send_list[i] = 0;
//
// First post the receives. These are immediate (non-blocking) receives;
// we do not wait for them to complete here, but at the MPI_Wait calls below.
//
int MPI_MY_TAG = 99;
MPI_Request * request = new MPI_Request[max_num_messages];
for (i = 0; i < num_send_neighbors; i++)
{
MPI_Irecv(tmp_buffer+i, 1, MPI_INT, MPI_ANY_SOURCE, MPI_MY_TAG,
MPI_COMM_WORLD, request+i);
}
// send messages
for (i = 0; i < num_recv_neighbors; i++)
MPI_Send(tmp_buffer+i, 1, MPI_INT, recv_list[i], MPI_MY_TAG,
MPI_COMM_WORLD);
///
// Receive message from each send neighbor to construct 'send_list'.
///
MPI_Status status;
for (i = 0; i < num_send_neighbors; i++)
{
if ( MPI_Wait(request+i, &status) )
{
cerr << "MPI_Wait error\n"<<endl;
exit(-1);
}
send_list[i] = status.MPI_SOURCE;
}
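// send_list now holds the ranks (taken from status.MPI_SOURCE) that will
// need entries of x from this process during the sparse matrix-vector product.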
/////////////////////////////////////////////////////////////////////////
///
// Compare the two lists. In most cases they should be the same.
// However, if they are not then add new entries to the recv list
// that are in the send list (but not already in the recv list).
///
/////////////////////////////////////////////////////////////////////////
for (j = 0; j < num_send_neighbors; j++)
{
int found = 0;
for (i = 0; i < num_recv_neighbors; i++)
{
if (recv_list[i] == send_list[j])
found = 1;
}
if (found == 0) {
if (debug) cout << "Processor " << rank << " of " << size <<
": recv_list[" << num_recv_neighbors <<"] = "
<< send_list[j] << endl;
recv_list[num_recv_neighbors] = send_list[j];
(num_recv_neighbors)++;
}
}
delete [] send_list;
num_send_neighbors = num_recv_neighbors;
if (num_send_neighbors > max_num_messages)
{
cerr << "Must increase max_external in HPC_Sparse_Matrix.hpp"
<<endl;
abort();
}
/////////////////////////////////////////////////////////////////////////
/// Start filling HPC_Sparse_Matrix struct
/////////////////////////////////////////////////////////////////////////
A->total_to_be_sent = total_to_be_sent;
int * elements_to_send = new int[total_to_be_sent];
A->elements_to_send = elements_to_send;
for (i = 0 ; i < total_to_be_sent; i++ ) elements_to_send[i] = 0;
//
// Create 'new_external' which explicitly puts the external elements in the
// order given by 'external_local_index'
//
int * new_external = new int[num_external];
for (i = 0; i < num_external; i++) {
new_external[external_local_index[i] - local_nrow] = external_index[i];
}
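// new_external[k] is the global index of the external element stored at
// local index local_nrow + k, so the lists sent below tell each owner which
// of its entries we need and in what order we expect to receive them.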
/////////////////////////////////////////////////////////////////////////
//
// Send each processor the global index list of the external elements in the
// order that I will want to receive them when updating my external elements
//
/////////////////////////////////////////////////////////////////////////
int * lengths = new int[num_recv_neighbors];
MPI_MY_TAG++;
// First post receives
for (i = 0; i < num_recv_neighbors; i++)
{
int partner = recv_list[i];
MPI_Irecv(lengths+i, 1, MPI_INT, partner, MPI_MY_TAG, MPI_COMM_WORLD,
request+i);
}
int * neighbors = new int[max_num_neighbors];
int * recv_length = new int[max_num_neighbors];
int * send_length = new int[max_num_neighbors];
A->neighbors = neighbors;
A->recv_length = recv_length;
A->send_length = send_length;
j = 0;
for (i = 0; i < num_recv_neighbors; i++)
{
int start = j;
int newlength = 0;
// go through list of external elements until updating
// processor changes
while ((j < num_external) &&
(new_external_processor[j] == recv_list[i]))
{
newlength++;
j++;
if (j == num_external) break;
}
recv_length[i] = newlength;
neighbors[i] = recv_list[i];
length = j - start;
MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
}
// Complete the receives of the number of externals
for (i = 0; i < num_recv_neighbors; i++)
{
if ( MPI_Wait(request+i, &status) )
{
cerr << "MPI_Wait error\n"<<endl;
exit(-1);
}
send_length[i] = lengths[i];
}
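// send_length[i] is the number of entries of x that neighbor i expects to
// receive from this process.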
delete [] lengths;
///////////////////////////////////////////////////////////////////
// Build "elements_to_send" list. These are the x elements I own
// that need to be sent to other processors.
///////////////////////////////////////////////////////////////////
MPI_MY_TAG++;
j = 0;
for (i = 0; i < num_recv_neighbors; i++)
{
MPI_Irecv(elements_to_send+j, send_length[i], MPI_INT, neighbors[i],
MPI_MY_TAG, MPI_COMM_WORLD, request+i);
j += send_length[i];
}
j = 0;
for (i = 0; i < num_recv_neighbors; i++)
{
int start = j;
int newlength = 0;
// Go through list of external elements
// until updating processor changes. This is redundant, but
// saves us from recording this information.
while ((j < num_external) &&
(new_external_processor[j] == recv_list[i])) {
newlength++;
j++;
if (j == num_external) break;
}
MPI_Send(new_external+start, j-start, MPI_INT, recv_list[i],
MPI_MY_TAG, MPI_COMM_WORLD);
}
// receive from each neighbor the global index list of external elements
for (i = 0; i < num_recv_neighbors; i++)
{
if ( MPI_Wait(request+i, &status) )
{
cerr << "MPI_Wait error\n"<<endl;
exit(-1);
}
}
/// replace global indices by local indices ///
for (i = 0; i < total_to_be_sent; i++) elements_to_send[i] -= start_row;
////////////////
// Finish up !!
////////////////
A->num_send_neighbors = num_send_neighbors;
A->local_ncol = A->local_nrow + num_external;
// Used in exchange_externals
double *send_buffer = new double[total_to_be_sent];
A->send_buffer = send_buffer;
delete [] tmp_buffer;
delete [] global_index_offsets;
delete [] recv_list;
delete [] external_processor;
delete [] new_external;
delete [] new_external_processor;
delete [] request;
return;
}
#endif // USING_MPI