//@HEADER
// ************************************************************************
//
// HPCCG: Simple Conjugate Gradient Benchmark Code
// Copyright (2006) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// This library is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as
// published by the Free Software Foundation; either version 2.1 of the
// License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
//@HEADER
#ifdef USING_MPI // Compile this routine only if running in parallel
#include <iostream>
using std::cout;
using std::cerr;
using std::endl;
#include <map>
#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include <cassert>
#include "HPC_Sparse_Matrix.hpp"
#include "read_HPC_row.hpp"
#include "make_local_matrix.hpp"
#include "mytimer.hpp"
//#define DEBUG
void make_local_matrix(HPC_Sparse_Matrix * A)
{
std::map< int, int > externals;
int i, j, k;
int num_external = 0;
double t0;
int debug_details = 0; // Set to 1 for voluminous output
#ifdef DEBUG
int debug = 1;
#else
int debug = 0;
#endif
// Get MPI process info
int size, rank; // Number of MPI processes, My process ID
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
// Extract Matrix pieces
int start_row = A->start_row;
int stop_row = A->stop_row;
int total_nrow = A->total_nrow;
long long total_nnz = A->total_nnz;
int local_nrow = A->local_nrow;
int local_nnz = A->local_nnz;
int * nnz_in_row = A->nnz_in_row;
double ** ptr_to_vals_in_row = A->ptr_to_vals_in_row;
int ** ptr_to_inds_in_row = A->ptr_to_inds_in_row;
// We need to convert the index values for the rows on this processor
// to a local index space. We need to:
// - Determine if each index refers to a local value or an external value
// - If local, subtract start_row from index value to get local index
// - If external, find out if it is already accounted for.
// - If so, then do nothing,
// - otherwise
// - add it to the list of external indices,
// - find out which processor owns the value.
// - Set up communication for sparse MV operation.
///////////////////////////////////////////
// Scan the indices and transform to local
///////////////////////////////////////////
if (debug) t0 = mytimer();
int *external_index = new int[max_external];
int *external_local_index = new int[max_external];
A->external_index = external_index;
A->external_local_index = external_local_index;
for (i=0; i< local_nrow; i++)
{
for (j=0; j<nnz_in_row[i]; j++)
{
int cur_ind = ptr_to_inds_in_row[i][j];
if (debug_details)
cout << "Process "<<rank<<" of "<<size<<" getting index "
<<cur_ind<<" in local row "<<i<<endl;
if (start_row <= cur_ind && cur_ind <= stop_row)
{
ptr_to_inds_in_row[i][j] -= start_row;
}
else // Must find out if we have already set up this point
{
if (externals.find(cur_ind)==externals.end())
{
externals[cur_ind] = num_external++;
if (num_external<=max_external)
{
external_index[num_external-1] = cur_ind;
// Mark index as external by adding one and negating it
ptr_to_inds_in_row[i][j] = - (ptr_to_inds_in_row[i][j] + 1);
}
else
{
cerr << "Must increase max_external in HPC_Sparse_Matrix.hpp"
<<endl;
abort();
}
}
else
{
// Mark index as external by adding 1 and negating it
ptr_to_inds_in_row[i][j] = - (ptr_to_inds_in_row[i][j] + 1);
}
}
}
}
if (debug) {
t0 = mytimer() - t0;
cout << " Time in transform to local phase = " << t0 << endl;
cout << "Processor " << rank << " of " << size <<
": Number of external equations = " << num_external << endl;
}
////////////////////////////////////////////////////////////////////////////
// Go through list of externals to find out which processors must be accessed.
////////////////////////////////////////////////////////////////////////////
if (debug) t0 = mytimer();
A->num_external = num_external;
int * tmp_buffer = new int[size]; // Temp buffer space needed below
// Build list of global index offsets
int * global_index_offsets = new int[size];
for (i=0;i<size; i++) tmp_buffer[i] = 0; // First zero out
tmp_buffer[rank] = start_row; // This is my start row
// This call sends the start_row of each ith processor to the ith
// entry of global_index_offsets on all processors.
// Thus, each processor knows the range of indices owned by all
// other processors.
// Note: There might be a better algorithm for doing this, but this
// will work...
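// Illustrative example (hypothetical numbers): with three ranks owning rows
// 0-99, 100-199 and 200-299, each rank contributes its start_row in its own
// slot and zero elsewhere, so the summed result on every rank is
// global_index_offsets = {0, 100, 200}.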
MPI_Allreduce(tmp_buffer, global_index_offsets, size, MPI_INT,
MPI_SUM, MPI_COMM_WORLD);
// Go through list of externals and find the processor that owns each
int * external_processor = new int[num_external];
int * new_external_processor = new int[num_external];
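// The owner of a global index is the rank with the largest start_row
// (i.e., global_index_offsets entry) not exceeding the index, so scan the
// offsets from the highest rank downward and stop at the first match.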
for (i=0; i< num_external; i++)
{
int cur_ind = external_index[i];
for (int j=size-1; j>=0; j--)
if (global_index_offsets[j] <= cur_ind)
{
external_processor[i] = j;
break;
}
}
if (debug) {
t0 = mytimer() - t0;
cout << " Time in finding processors phase = " << t0 << endl;
}
////////////////////////////////////////////////////////////////////////////
// Sift through the external elements. For each newly encountered external
// point assign it the next index in the sequence. Then look for other
// external elements that are updated by the same node and assign them the
// next set of index numbers in the sequence (i.e., elements updated by the
// same node have consecutive indices).
////////////////////////////////////////////////////////////////////////////
if (debug) t0 = mytimer();
int count = local_nrow;
for (i = 0; i < num_external; i++) external_local_index[i] = -1;
for (i = 0; i < num_external; i++) {
if (external_local_index[i] == -1) {
external_local_index[i] = count++;
for (j = i + 1; j < num_external; j++) {
if (external_processor[j] == external_processor[i])
external_local_index[j] = count++;
}
}
}
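// At this point every external has a local index in the range
// local_nrow .. local_nrow+num_external-1, and externals owned by the same
// processor occupy a single contiguous block of those indices.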
if (debug) {
t0 = mytimer() - t0;
cout << " Time in scanning external indices phase = " << t0 << endl;
}
if (debug) t0 = mytimer();
for (i=0; i< local_nrow; i++)
{
for (j=0; j<nnz_in_row[i]; j++)
{
if (ptr_to_inds_in_row[i][j]<0) // Change index values of externals
{
int cur_ind = - ptr_to_inds_in_row[i][j] - 1;
ptr_to_inds_in_row[i][j] = external_local_index[externals[cur_ind]];
}
}
}
for (i = 0 ; i < num_external; i++) new_external_processor[i] = 0;
for (i = 0; i < num_external; i++)
new_external_processor[external_local_index[i] - local_nrow] =
external_processor[i];
if (debug) {
t0 = mytimer() - t0;
cout << " Time in assigning external indices phase = " << t0 << endl;
}
if (debug_details)
{
for (i = 0; i < num_external; i++)
{
cout << "Processor " << rank << " of " << size <<
": external processor["<< i<< "] = " << external_processor[i]<< endl;
cout << "Processor " << rank << " of " << size <<
": new external processor["<< i<< "] = "
<< new_external_processor[i]<< endl;
}
}
////////////////////////////////////////////////////////////////////////////
///
// Count the number of neighbors from which we receive information to update
// our external elements. Additionally, fill the array tmp_neighbors in the
// following way:
// tmp_neighbors[i] = 0 ==> No external elements are updated by
// processor i.
// tmp_neighbors[i] = x ==> (x-1)/size elements are updated from
// processor i.
///
////////////////////////////////////////////////////////////////////////////
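// Illustrative example (hypothetical numbers): with size = 8, if ranks 2
// and 5 need 3 and 1 of our elements respectively, they contribute
// 1 + 3*8 = 25 and 1 + 1*8 = 9 to our slot, so after the Allreduce below
// tmp_buffer[rank] = 34, giving 34 % 8 = 2 send neighbors and
// (34 - 2) / 8 = 4 elements to send.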
t0 = mytimer();
int * tmp_neighbors = new int[size];
for (i = 0 ; i < size ; i++) tmp_neighbors[i] = 0;
int num_recv_neighbors = 0;
int length = 1;
for (i = 0; i < num_external; i++)
{
if (tmp_neighbors[new_external_processor[i]] == 0)
{
num_recv_neighbors++;
tmp_neighbors[new_external_processor[i]] = 1;
}
tmp_neighbors[new_external_processor[i]] += size;
}
/// sum over all processors all the tmp_neighbors arrays ///
MPI_Allreduce(tmp_neighbors, tmp_buffer, size, MPI_INT, MPI_SUM,
MPI_COMM_WORLD);
/// decode the combined 'tmp_neighbors' (stored in tmp_buffer)
// array from all the processors
int num_send_neighbors = tmp_buffer[rank] % size;
/// decode 'tmp_buffer[rank]' to deduce the total number of elements
// we must send
int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;
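// Each rank that needs data from this process contributed exactly one to
// our slot, plus 'size' for every element it needs. Since the number of
// such ranks is at most size-1, the remainder mod 'size' recovers the
// neighbor count and the quotient recovers the element count.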
//
// Check to see if we have enough workspace allocated. This could be
// dynamically modified, but let's keep it simple for now...
//
if (num_send_neighbors > max_num_messages)
{
cerr << "Must increase max_num_messages in HPC_Sparse_Matrix.hpp"
<<endl;
cerr << "Must be at least " << num_send_neighbors <<endl;
abort();
}
if (total_to_be_sent > max_external )
{
cerr << "Must increase max_external in HPC_Sparse_Matrix.hpp"
<<endl;
cerr << "Must be at least " << total_to_be_sent <<endl;
abort();
}
delete [] tmp_neighbors;
if (debug) {
t0 = mytimer() - t0;
cout << " Time in finding neighbors phase = " << t0 << endl;
}
if (debug) cout << "Processor " << rank << " of " << size <<
": Number of send neighbors = " << num_send_neighbors << endl;
if (debug) cout << "Processor " << rank << " of " << size <<
": Number of receive neighbors = " << num_recv_neighbors << endl;
if (debug) cout << "Processor " << rank << " of " << size <<
": Total number of elements to send = " << total_to_be_sent << endl;
if (debug) MPI_Barrier(MPI_COMM_WORLD);
/////////////////////////////////////////////////////////////////////////
///
// Make a list of the neighbors that will send information to update our
// external elements (in the order that we will receive this information).
///
/////////////////////////////////////////////////////////////////////////
int * recv_list = new int[max_external];
j = 0;
if (num_external > 0) recv_list[j++] = new_external_processor[0]; // Guard: a rank with no externals has no receive neighbors
for (i = 1; i < num_external; i++) {
if (new_external_processor[i - 1] != new_external_processor[i]) {
recv_list[j++] = new_external_processor[i];
}
}
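// Because the externals were grouped contiguously by owning processor above,
// detecting a change in new_external_processor lists each receive neighbor
// exactly once, in the order in which we will receive its data.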
//
// Send a dummy one-integer message to each of our recv neighbors
// (the payload is ignored; only the sender's rank matters)
//
int * send_list = new int[num_send_neighbors];
for (i = 0 ; i < num_send_neighbors; i++ ) send_list[i] = 0;
//
// First post the receives. These are immediate (non-blocking) receives;
// we do not wait for them to complete here, but at the MPI_Wait calls below.
//
int MPI_MY_TAG = 99;
MPI_Request * request = new MPI_Request[max_num_messages];
for (i = 0; i < num_send_neighbors; i++)
{
MPI_Irecv(tmp_buffer+i, 1, MPI_INT, MPI_ANY_SOURCE, MPI_MY_TAG,
MPI_COMM_WORLD, request+i);
}
// send messages
for (i = 0; i < num_recv_neighbors; i++)
MPI_Send(tmp_buffer+i, 1, MPI_INT, recv_list[i], MPI_MY_TAG,
MPI_COMM_WORLD);
///
// Receive message from each send neighbor to construct 'send_list'.
///
MPI_Status status;
for (i = 0; i < num_send_neighbors; i++)
{
if ( MPI_Wait(request+i, &status) )
{
cerr << "MPI_Wait error\n"<<endl;
exit(-1);
}
send_list[i] = status.MPI_SOURCE;
}
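// send_list now holds the ranks (taken from status.MPI_SOURCE) that will
// need entries of x from this process during the sparse matrix-vector product.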
/////////////////////////////////////////////////////////////////////////
///
// Compare the two lists. In most cases they should be the same.
// However, if they are not then add new entries to the recv list
// that are in the send list (but not already in the recv list).
///
/////////////////////////////////////////////////////////////////////////
for (j = 0; j < num_send_neighbors; j++)
{
int found = 0;
for (i = 0; i < num_recv_neighbors; i++)
{
if (recv_list[i] == send_list[j])
found = 1;
}
if (found == 0) {
if (debug) cout << "Processor " << rank << " of " << size <<
": recv_list[" << num_recv_neighbors <<"] = "
<< send_list[j] << endl;
recv_list[num_recv_neighbors] = send_list[j];
(num_recv_neighbors)++;
}
}
delete [] send_list;
num_send_neighbors = num_recv_neighbors;
if (num_send_neighbors > max_num_messages)
{
cerr << "Must increase max_external in HPC_Sparse_Matrix.hpp"
<<endl;
abort();
}
/////////////////////////////////////////////////////////////////////////
/// Start filling HPC_Sparse_Matrix struct
/////////////////////////////////////////////////////////////////////////
A->total_to_be_sent = total_to_be_sent;
int * elements_to_send = new int[total_to_be_sent];
A->elements_to_send = elements_to_send;
for (i = 0 ; i < total_to_be_sent; i++ ) elements_to_send[i] = 0;
//
// Create 'new_external' which explicitly puts the external elements in the
// order given by 'external_local_index'
//
int * new_external = new int[num_external];
for (i = 0; i < num_external; i++) {
new_external[external_local_index[i] - local_nrow] = external_index[i];
}
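// new_external[k] is the global index of the external element stored at
// local index local_nrow + k, so the lists sent below tell each owner which
// of its entries we need and in what order we expect to receive them.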
/////////////////////////////////////////////////////////////////////////
//
// Send each processor the global index list of the external elements in the
// order that I will want to receive them when updating my external elements
//
/////////////////////////////////////////////////////////////////////////
int * lengths = new int[num_recv_neighbors];
MPI_MY_TAG++;
// First post receives
for (i = 0; i < num_recv_neighbors; i++)
{
int partner = recv_list[i];
MPI_Irecv(lengths+i, 1, MPI_INT, partner, MPI_MY_TAG, MPI_COMM_WORLD,
request+i);
}
int * neighbors = new int[max_num_neighbors];
int * recv_length = new int[max_num_neighbors];
int * send_length = new int[max_num_neighbors];
A->neighbors = neighbors;
A->recv_length = recv_length;
A->send_length = send_length;
j = 0;
for (i = 0; i < num_recv_neighbors; i++)
{
int start = j;
int newlength = 0;
// go through list of external elements until updating
// processor changes
while ((j < num_external) &&
(new_external_processor[j] == recv_list[i]))
{
newlength++;
j++;
if (j == num_external) break;
}
recv_length[i] = newlength;
neighbors[i] = recv_list[i];
length = j - start;
MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
}
// Complete the receives of the number of externals
for (i = 0; i < num_recv_neighbors; i++)
{
if ( MPI_Wait(request+i, &status) )
{
cerr << "MPI_Wait error\n"<<endl;
exit(-1);
}
send_length[i] = lengths[i];
}
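// send_length[i] is the number of entries of x that neighbor i expects to
// receive from this process.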
delete [] lengths;
///////////////////////////////////////////////////////////////////
// Build "elements_to_send" list. These are the x elements I own
// that need to be sent to other processors.
///////////////////////////////////////////////////////////////////
MPI_MY_TAG++;
j = 0;
for (i = 0; i < num_recv_neighbors; i++)
{
MPI_Irecv(elements_to_send+j, send_length[i], MPI_INT, neighbors[i],
MPI_MY_TAG, MPI_COMM_WORLD, request+i);
j += send_length[i];
}
j = 0;
for (i = 0; i < num_recv_neighbors; i++)
{
int start = j;
int newlength = 0;
// Go through list of external elements
// until updating processor changes. This is redundant, but
// saves us from recording this information.
while ((j < num_external) &&
(new_external_processor[j] == recv_list[i])) {
newlength++;
j++;
if (j == num_external) break;
}
MPI_Send(new_external+start, j-start, MPI_INT, recv_list[i],
MPI_MY_TAG, MPI_COMM_WORLD);
}
// receive from each neighbor the global index list of external elements
for (i = 0; i < num_recv_neighbors; i++)
{
if ( MPI_Wait(request+i, &status) )
{
cerr << "MPI_Wait error\n"<<endl;
exit(-1);
}
}
/// replace global indices by local indices ///
for (i = 0; i < total_to_be_sent; i++) elements_to_send[i] -= start_row;
////////////////
// Finish up !!
////////////////
A->num_send_neighbors = num_send_neighbors;
A->local_ncol = A->local_nrow + num_external;
// Used in exchange_externals
double *send_buffer = new double[total_to_be_sent];
A->send_buffer = send_buffer;
delete [] tmp_buffer;
delete [] global_index_offsets;
delete [] recv_list;
delete [] external_processor;
delete [] new_external;
delete [] new_external_processor;
delete [] request;
return;
}
#endif // USING_MPI