#ifndef _make_local_matrix_hpp_
#define _make_local_matrix_hpp_
//@HEADER
// ************************************************************************
//
// MiniFE: Simple Finite Element Assembly and Solve
// Copyright (2006-2013) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// This library is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as
// published by the Free Software Foundation; either version 2.1 of the
// License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA
//
// ************************************************************************
//@HEADER
#include <utils.hpp>
#include <map>
#include <cassert>
#ifdef HAVE_MPI
#include <mpi.h>
#endif
namespace miniFE {
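// Convert a distributed matrix's global column indices to process-local
// indices and build the communication metadata (neighbors, recv_length,
// send_length, elements_to_send) needed to exchange external ("ghost")
// values during a sparse matrix-vector product.
//
// A minimal usage sketch (assumes the CSRMatrix type used elsewhere in
// miniFE and an already-assembled matrix with global column indices):
//
//   CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal> A;
//   // ... assemble A in parallel ...
//   make_local_matrix(A);  // A now uses local column indices;
//                          // A.num_cols counts owned plus ghost columns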
template<typename MatrixType>
void
make_local_matrix(MatrixType& A)
{
#ifdef HAVE_MPI
int numprocs = 1, myproc = 0;
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
if (numprocs < 2) {
A.num_cols = A.rows.size();
A.has_local_indices = true;
return;
}
typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
typedef typename MatrixType::ScalarType Scalar;
std::map<GlobalOrdinal,GlobalOrdinal> externals;
LocalOrdinal num_external = 0;
//Extract Matrix pieces
size_t local_nrow = A.rows.size();
GlobalOrdinal start_row = local_nrow>0 ? A.rows[0] : -1;
GlobalOrdinal stop_row = local_nrow>0 ? A.rows[local_nrow-1] : -1;
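// A processor that owns no rows reports start_row = stop_row = -1; the
// owner search further below skips such processors via its '>= 0' test.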
// We need to convert the index values for the rows on this processor
// to a local index space. We need to:
// - Determine whether each index refers to a local value or an external value
// - If local, subtract start_row from index value to get local index
// - If external, find out if it is already accounted for.
// - If so, then do nothing,
// - otherwise
// - add it to the list of external indices,
// - find out which processor owns the value.
// - Set up communication for sparse MV operation
///////////////////////////////////////////
// Scan the indices and transform to local
///////////////////////////////////////////
std::vector<GlobalOrdinal>& external_index = A.external_index;
for(size_t i=0; i<A.rows.size(); ++i) {
GlobalOrdinal* Acols = NULL;
Scalar* Acoefs = NULL;
size_t row_len = 0;
A.get_row_pointers(A.rows[i], row_len, Acols, Acoefs);
for(size_t j=0; j<row_len; ++j) {
GlobalOrdinal cur_ind = Acols[j];
if (start_row <= cur_ind && cur_ind <= stop_row) {
Acols[j] -= start_row;
}
else { // Must find out if we have already set up this point
if (externals.find(cur_ind) == externals.end()) {
externals[cur_ind] = num_external++;
external_index.push_back(cur_ind);
}
// Mark index as external by adding 1 and negating it
Acols[j] = -(Acols[j] + 1);
}
}
}
////////////////////////////////////////////////////////////////////////
// Go through list of externals to find out which processors must be accessed.
////////////////////////////////////////////////////////////////////////
std::vector<GlobalOrdinal> tmp_buffer(numprocs, 0); // Temp buffer space needed below
// Build list of global index offsets
std::vector<GlobalOrdinal> global_index_offsets(numprocs, 0);
tmp_buffer[myproc] = start_row; // This is my start row
// This call sends the start_row of each ith processor to the ith
// entry of global_index_offsets on all processors.
// Thus, each processor knows the range of indices owned by all
// other processors.
// Note: There might be a better algorithm for doing this, but this
// will work...
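// (Summing works because each processor's tmp_buffer is zero everywhere
// except in its own slot.)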
MPI_Datatype mpi_dtype = TypeTraits<GlobalOrdinal>::mpi_type();
MPI_Allreduce(&tmp_buffer[0], &global_index_offsets[0], numprocs, mpi_dtype,
MPI_SUM, MPI_COMM_WORLD);
// Go through list of externals and find the processor that owns each
std::vector<int> external_processor(num_external);
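// Scan the offsets from the highest rank downward; the owner is the last
// processor whose start row does not exceed cur_ind. The '>= 0' test skips
// processors that own no rows (their offset was recorded as -1).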
for(LocalOrdinal i=0; i<num_external; ++i) {
GlobalOrdinal cur_ind = external_index[i];
for(int j=numprocs-1; j>=0; --j) {
if (global_index_offsets[j] <= cur_ind && global_index_offsets[j] >= 0) {
external_processor[i] = j;
break;
}
}
}
/////////////////////////////////////////////////////////////////////////
// Sift through the external elements. For each newly encountered external
// point, assign it the next index in the sequence. Then look for other
// external elements that are updated by the same processor and assign them
// the next indices in the sequence (i.e., elements updated by the same
// processor have consecutive local indices).
/////////////////////////////////////////////////////////////////////////
size_t count = local_nrow;
std::vector<GlobalOrdinal>& external_local_index = A.external_local_index;
external_local_index.assign(num_external, -1);
for(LocalOrdinal i=0; i<num_external; ++i) {
if (external_local_index[i] == -1) {
external_local_index[i] = count++;
for(LocalOrdinal j=i+1; j<num_external; ++j) {
if (external_processor[j] == external_processor[i])
external_local_index[j] = count++;
}
}
}
for(size_t i=0; i<local_nrow; ++i) {
GlobalOrdinal* Acols = NULL;
Scalar* Acoefs = NULL;
size_t row_len = 0;
A.get_row_pointers(A.rows[i], row_len, Acols, Acoefs);
for(size_t j=0; j<row_len; ++j) {
if (Acols[j] < 0) { // Change index values of externals
GlobalOrdinal cur_ind = -Acols[j] - 1;
Acols[j] = external_local_index[externals[cur_ind]];
}
}
}
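// Reorder the owning-processor list so that entry k describes the external
// element with local index local_nrow + k.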
std::vector<int> new_external_processor(num_external, 0);
for(LocalOrdinal i=0; i<num_external; ++i) {
new_external_processor[external_local_index[i]-local_nrow] =
external_processor[i];
}
////////////////////////////////////////////////////////////////////////
///
// Count the number of neighbors from which we receive information to update
// our external elements. Additionally, fill the array tmp_neighbors in the
// following way:
// tmp_neighbors[i] = 0 ==> No external elements are updated by
// processor i.
// tmp_neighbors[i] = x ==> (x-1)/numprocs elements are updated by
// processor i.
///
////////////////////////////////////////////////////////////////////////
std::vector<GlobalOrdinal> tmp_neighbors(numprocs, 0);
int num_recv_neighbors = 0;
int length = 1;
for(LocalOrdinal i=0; i<num_external; ++i) {
if (tmp_neighbors[new_external_processor[i]] == 0) {
++num_recv_neighbors;
tmp_neighbors[new_external_processor[i]] = 1;
}
tmp_neighbors[new_external_processor[i]] += numprocs;
}
/// sum over all processor all the tmp_neighbors arrays ///
MPI_Allreduce(&tmp_neighbors[0], &tmp_buffer[0], numprocs, mpi_dtype,
MPI_SUM, MPI_COMM_WORLD);
// decode the combined 'tmp_neighbors' (stored in tmp_buffer)
// array from all the processors
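// After the reduction, tmp_buffer[myproc] = (number of processors that need
// data from me) + numprocs * (total number of entries I must send), so the
// two quantities are recovered with a modulo and a division.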
GlobalOrdinal num_send_neighbors = tmp_buffer[myproc] % numprocs;
// decode 'tmp_buffer[myproc]' to deduce the total number of elements
// we must send
GlobalOrdinal total_to_be_sent = (tmp_buffer[myproc] - num_send_neighbors) / numprocs;
///////////////////////////////////////////////////////////////////////
///
// Make a list of the neighbors that will send information to update our
// external elements (in the order that we will receive this information).
///
///////////////////////////////////////////////////////////////////////
std::vector<int> recv_list;
if (num_external > 0) recv_list.push_back(new_external_processor[0]);
for(LocalOrdinal i=1; i<num_external; ++i) {
if (new_external_processor[i-1] != new_external_processor[i]) {
recv_list.push_back(new_external_processor[i]);
}
}
//
// Send a 0 length message to each of our recv neighbors
//
std::vector<int> send_list(num_send_neighbors, 0);
//
// First post the receives. These are non-blocking (immediate) receives;
// we do not wait for the results here, but at the MPI_Wait calls below.
//
int MPI_MY_TAG = 99;
std::vector<MPI_Request> request(num_send_neighbors);
for(int i=0; i<num_send_neighbors; ++i) {
MPI_Irecv(&tmp_buffer[i], 1, mpi_dtype, MPI_ANY_SOURCE, MPI_MY_TAG,
MPI_COMM_WORLD, &request[i]);
}
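// (The receives above use MPI_ANY_SOURCE; status.MPI_SOURCE from the
// matching MPI_Wait below identifies each processor that will request
// data from us, i.e. each send neighbor.)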
// send messages
for(int i=0; i<num_recv_neighbors; ++i) {
MPI_Send(&tmp_buffer[i], 1, mpi_dtype, recv_list[i], MPI_MY_TAG,
MPI_COMM_WORLD);
}
///
// Receive message from each send neighbor to construct 'send_list'.
///
MPI_Status status;
for(int i=0; i<num_send_neighbors; ++i) {
if (MPI_Wait(&request[i], &status) != MPI_SUCCESS) {
std::cerr << "MPI_Wait error" << std::endl;
MPI_Abort(MPI_COMM_WORLD, -1);
}
send_list[i] = status.MPI_SOURCE;
}
//////////////////////////////////////////////////////////////////////
///
// Compare the two lists. In most cases they should be the same.
// However, if they are not then add new entries to the recv list
// that are in the send list (but not already in the recv list).
///
//////////////////////////////////////////////////////////////////////
for(int j=0; j<num_send_neighbors; ++j) {
int found = 0;
for(int i=0; i<num_recv_neighbors; ++i) {
if (recv_list[i] == send_list[j]) found = 1;
}
if (found == 0) {
recv_list.push_back(send_list[j]);
++num_recv_neighbors;
}
}
num_send_neighbors = num_recv_neighbors;
request.resize(num_send_neighbors);
A.elements_to_send.assign(total_to_be_sent, 0);
A.send_buffer.assign(total_to_be_sent, 0);
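// elements_to_send will be filled below with the indices each neighbor
// requests (global at first, converted to local at the end); send_buffer
// is a same-sized scratch buffer used when the corresponding values are
// exchanged.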
//
// Create 'new_external', which explicitly puts the external elements in the
// order given by 'external_local_index'
//
std::vector<GlobalOrdinal> new_external(num_external);
for(LocalOrdinal i=0; i<num_external; ++i) {
new_external[external_local_index[i] - local_nrow] = external_index[i];
}
/////////////////////////////////////////////////////////////////////////
//
// Send each processor the global index list of the external elements in the
// order that I will want to receive them when updating my external elements.
//
/////////////////////////////////////////////////////////////////////////
std::vector<int> lengths(num_recv_neighbors);
++MPI_MY_TAG;
// First post receives
for(int i=0; i<num_recv_neighbors; ++i) {
int partner = recv_list[i];
MPI_Irecv(&lengths[i], 1, MPI_INT, partner, MPI_MY_TAG, MPI_COMM_WORLD,
&request[i]);
}
std::vector<int>& neighbors = A.neighbors;
std::vector<int>& recv_length = A.recv_length;
std::vector<int>& send_length = A.send_length;
neighbors.resize(num_recv_neighbors, 0);
A.request.resize(num_recv_neighbors);
recv_length.resize(num_recv_neighbors, 0);
send_length.resize(num_recv_neighbors, 0);
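// neighbors[i] is the rank of the i-th neighbor; recv_length[i] and
// send_length[i] record how many entries we receive from and send to
// that neighbor, respectively.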
LocalOrdinal j = 0;
for(int i=0; i<num_recv_neighbors; ++i) {
int start = j;
int newlength = 0;
// go through the list of external elements until the
// updating processor changes
while((j < num_external) &&
(new_external_processor[j] == recv_list[i])) {
++newlength;
++j;
if (j == num_external) break;
}
recv_length[i] = newlength;
neighbors[i] = recv_list[i];
length = j - start;
MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
}
// Complete the receives of the number of externals
for(int i=0; i<num_recv_neighbors; ++i) {
if (MPI_Wait(&request[i], &status) != MPI_SUCCESS) {
std::cerr << "MPI_Wait error" << std::endl;
MPI_Abort(MPI_COMM_WORLD, -1);
}
send_length[i] = lengths[i];
}
////////////////////////////////////////////////////////////////////////
// Build "elements_to_send" list. These are the entries of the vector x
// that I own and that need to be sent to other processors.
////////////////////////////////////////////////////////////////////////
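// Each neighbor now tells us exactly which of our rows it needs (received
// into elements_to_send), while we send each neighbor the segment of
// new_external that lists the entries we need from it.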
++MPI_MY_TAG;
j = 0;
for(int i=0; i<num_recv_neighbors; ++i) {
MPI_Irecv(&A.elements_to_send[j], send_length[i], mpi_dtype, neighbors[i],
MPI_MY_TAG, MPI_COMM_WORLD, &request[i]);
j += send_length[i];
}
j = 0;
for(int i=0; i<num_recv_neighbors; ++i) {
LocalOrdinal start = j;
LocalOrdinal newlength = 0;
// Go through list of external elements
// until updating processor changes. This is redundant, but
// saves us from recording this information.
while((j < num_external) &&
(new_external_processor[j] == recv_list[i])) {
++newlength;
++j;
if (j == num_external) break;
}
MPI_Send(&new_external[start], j-start, mpi_dtype, recv_list[i],
MPI_MY_TAG, MPI_COMM_WORLD);
}
// receive from each neighbor the global index list of external elements
for(int i=0; i<num_recv_neighbors; ++i) {
if (MPI_Wait(&request[i], &status) != MPI_SUCCESS) {
std::cerr << "MPI_Wait error" << std::endl;
MPI_Abort(MPI_COMM_WORLD, -1);
}
}
/// replace global indices by local indices ///
for(GlobalOrdinal i=0; i<total_to_be_sent; ++i) {
A.elements_to_send[i] -= start_row;
if (A.elements_to_send[i] >= A.rows.size()) {
std::cout<<"start_row: "<<start_row<<", A.elements_to_send[i]: "<<A.elements_to_send[i]<<", A.rows.size(): "<<A.rows.size()<<std::endl;
assert(A.elements_to_send[i] < A.rows.size());
}
}
//////////////////
// Finish up !!
//////////////////
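// The locally indexed matrix has local_nrow owned columns plus
// num_external ghost columns.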
A.num_cols = local_nrow + num_external;
#else
A.num_cols = A.rows.size();
#endif
A.has_local_indices = true;
}
}//namespace miniFE
#endif