//     *********************************************************************
//     Programming high-performance parallel computers using message passing
//     Example wave.cc
//     
//     Solve 2D advection equation with forcing term numerically with
//     the Leap-frog scheme. Problem size defined in wave.dat
//     
//     Advanced example: Virtual topologies, persistent communication,
//                       derived datatypes, processor binding
//
//     Author: Jarmo Rantakokko
//     Date:   2001-01-24
//
//     *********************************************************************

#include  <iostream.h>
#include  <fstream.h>
#include  "wave.h"

main(int argc, char **argv)
{

  // Initialize MPI and create a 2D-grid topology.
  int myid,nproc,reorder=1,ndim=2,dims[2]={0,0},periods[2]={0,0};
  int coords[2];
  const int root=0;
  MPI_Comm proc_grid;
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD,&nproc);
  MPI_Dims_create(nproc,ndim,dims);
  MPI_Cart_create(MPI_COMM_WORLD,ndim,dims,periods,reorder,&proc_grid);
  MPI_Comm_rank(proc_grid,&myid);
  MPI_Cart_coords(proc_grid,myid,ndim,coords);

  // Variables
  int Nx,Ny,Nt,i,j,k,nnx,nny,rest,nx1,ny1;
  double x,y,dt,t,norm,dx,dy,t1,t2;
  
  // Problem size
  if (myid==root)
    {
      cout << "Computing... " << endl;
      ifstream file("wave.dat"); 
      file >> Nx; 
      file >> Ny;
      file.close();
    }
  MPI_Bcast(&Nx,1,MPI_INT,root,proc_grid);
  MPI_Bcast(&Ny,1,MPI_INT,root,proc_grid);
  dx=1.0/Nx; dy=1.0/Ny; dt=1.0/(Nx+Ny); Nt=1.0/dt;

  // Local arrays
  nnx=(Nx+1)/dims[0]; rest=(Nx+1)%dims[0];
  if (rest>coords[0]) 
    {
      nnx=nnx+1;
      nx1=nnx*coords[0]; 
    }
  else
    {
      nx1=rest*(nnx+1)+(coords[0]-rest)*nnx;
    }
  nny=(Ny+1)/dims[1]; rest=(Ny+1)%dims[1];
  if (rest>coords[1]) 
    {
      nny=nny+1;
      ny1=nny*coords[1]; 
    }
  else
    {
      ny1=rest*(nny+1)+(coords[1]-rest)*nny;
    }

  // Add ghost points (one on each side)
  Array2_dbl uold(nnx+2,nny+2),u(nnx+2,nny+2);
  Array2_dbl unew(nnx+2,nny+2),du(nnx+2,nny+2);
  Array2_dbl *u1=&uold;
  Array2_dbl *u2=&u;
  Array2_dbl *u3=&unew;
  Array2_dbl *temp;

  // Initiate persistent communication objects
  MPI_Request *reqarr1=new MPI_Request[8];
  MPI_Request *reqarr2=new MPI_Request[8];
  MPI_Request *reqarr3=new MPI_Request[8];
  MPI_Request *reqarrtemp;
  int n1,n2,n3;
  int *nreq1=&n1;
  int *nreq2=&n2;
  int *nreq3=&n3;
  int *nreqtemp;
  init_comm(uold,proc_grid,n1,reqarr1);
  init_comm(u,proc_grid,n2,reqarr2);
  init_comm(unew,proc_grid,n3,reqarr3);

  // Initial values
  t1=MPI_Wtime(); 
  for (i=1; i<=nnx; i++)
    for (j=1; j<=nny; j++)
      {
	x=double(i-1+nx1)*dx; y=double(j-1+ny1)*dy;
	u(i,j)=h(x+y)+up(x,y);
	unew(i,j)=h(x+y-2*dt)+up(x,y);
      }


  // Integrate numerically with Leap-frog
  for (k=2; k<=Nt; k++)
    {
      temp=u1; u1=u2; u2=u3; u3=temp;
      reqarrtemp=reqarr1; reqarr1=reqarr2; 
      reqarr2=reqarr3; reqarr3=reqarrtemp;
      nreqtemp=nreq1; nreq1=nreq2; nreq2=nreq3; nreq3=nreqtemp;
      diffop(du,*u2,Nx,Ny,*nreq2,reqarr2,coords,dims);
      for (i=1; i<=nnx; i++) 
	for (j=1; j<=nny; j++)
	  {
	    x=double(i-1+nx1)*dx; y=double(j-1+ny1)*dy;
	    (*u3)(i,j)=(*u1)(i,j)+2*dt*(F(x,y)-du(i,j));
	  }
      t=k*dt; 
      bound(*u3,*u1,t,dt,dx,dy,nx1,ny1,proc_grid);
    }

  // Residual
  norm=residual(*u3,t,dx,dy,nx1,ny1,proc_grid);
  t2=MPI_Wtime();


  if (myid==root)
    {
      cout << "-------------------------------" << endl;
      cout << "Problem size = " << Nx+1 << "x" << Ny+1 << endl;
      cout << "Error norm   = " << norm << endl;
      cout << "Wall time    = " << t2-t1 << " s" << endl;
      cout << "Processors   = " << dims[0] <<"x" << dims[1] << endl;
      cout << "-------------------------------" << endl;
    }

  // Exit MPI
  MPI_Finalize();
}