!==============================================================================
!
! Routines:
!
! (1) diagonalize()      Originally AC           Last Modified 7/8/2008 (JRD)
!
! generalised parallel hermitian eigenvalue solver,
! returns eigenvectors in shs and
! eigenvalues in egv. eigenvecs in zg see man pzheevx/pdsyevx,
! to save memory S(program) -> ZG(scala)
!                H(program) -> S(scala)
! -> means transformation to scala layout
! call pzheevx/pdsyevx
!                eigenvecs put in H(scala) -> ZG(program)
!
!        neig = number of eigenvectors
!        nmat = lda of shs and sis (=nv in code)
!
! interface routine for the pzheevx/pdsyevx, generalised parallel eigenvalue
! solver. Starts with the distributed S and H matrices. Redistributes
! them to blacs layout, then calls pzheevx/pdsyevx and
! then collects all the eigenvectors onto
! all pes. For more details of the scalapack routines and data layouts
! see http://www.netlib.org/scalapack/scalapack_home.html
!
! based on pssyevx_inter/pcheevx_inter, written by Andrew Canning
! NERSC/LBL 1998
!
!      distributed solver (block cyclic blacs layout):
!
!           nbl = blocksize
!           nprow = processor grid row
!           npcol = processor grid column
!
! double-precision eigenvalue solvers, pzheevx/pdsyevx
!
!===================================================================================

#include "f_defs.h"

subroutine diagonalize(pblock,neig,nmat,shs,egv,egs)

  use global_m
  use scalapack_m
  implicit none

  integer, intent(in) :: pblock
  integer, intent(in) :: neig
  integer, intent(in) :: nmat
  SCALAR, intent(inout) :: shs(nmat,peinf%nblocks*peinf%nblockd)
  real(DP), intent(out) :: egv(neig)
  SCALAR, intent(out) :: egs(nmat,pblock)

  character :: range
  character*100 :: tmpstr
  real(DP) :: abstol
  integer, allocatable :: iwork(:),ifail(:)
  integer :: nfound,ilow,iup,info,ii
  real(DP) :: ellow,elup
  real(DP), allocatable :: rwork(:),egv_t(:)
  SCALAR, allocatable :: work(:)
  real(DP) :: non_herm_max,non_herm_sum,non_herm

! Scalapack and blacs arrays

#ifdef USESCALAPACK
  type(scalapack) :: scal
  integer :: nrow_p(peinf%npes)
  integer :: desca(9)
  integer :: nbc,nbce,nbr,locsize,nn,nnp,np0,mq0
  integer :: clustersize,lwork,lrwork,liwork,nzfound
  integer :: idiff,ibc,ngr,ibr,ic,izero
  real(DP) :: dzero,orfac
  integer, allocatable :: iclustr(:)
  SCALAR, allocatable :: shs_bl(:),egs_bl(:)
  real(DP), allocatable :: gap(:)

  integer :: i,jj,inode,nq0,nod_g(nmat),icol(nmat)
  integer :: idum,kk,nbc_max,nbr_max, &
    numbl_max,iproc_n,num_max,ir_low, &
    ilen,icom,j,num_g_max,icoma(peinf%npes)

! index arrays for mpi  : H matrix

  integer, allocatable :: numr(:),nums(:)
  integer, allocatable :: indr(:,:),inds(:,:),iaddr(:,:), &
    iadds_r(:,:),iadds_c(:,:),iadds_d(:,:)

! index arrays for mpi : S matrix

  integer, allocatable :: numre(:),numse(:)
  integer, allocatable :: indre(:,:),indse(:,:),iaddre(:,:), &
    iaddse_r(:,:),iaddse_c(:,:),iaddse_d(:,:)
#endif  

  integer :: iloop,jloop

#ifdef USEESSL
  SCALAR, allocatable :: hupper(:)
  integer :: hup_counter
#endif

  PUSH_SUB(diagonalize)

! If neig > nmat, only the first neig eigenvectors/eigenvalues will
! be computed. Otherwise, calculate all of them.
! Set range

  if (neig.eq.nmat) then
    range='A'
  else if (neig.lt.nmat) then
    range='I'
  else
    write(tmpstr,'(a,i10,a,i10)') 'diagonalize: ', neig, ' eigenvalues requested > matrix size ', nmat
    call die(tmpstr)
  endif

! If running with single PE, use the serial subroutines. Otherwise,
! do the distributed job

  if (peinf%npes.eq.1) then
    if (nmat.ne.peinf%nblocks*peinf%nblockd) then
      call die('Hamiltonian matrix does not seem to be square!')
    endif
    abstol=0.0
    SAFE_ALLOCATE(ifail, (nmat))
    SAFE_ALLOCATE(work, (10*nmat))
    SAFE_ALLOCATE(iwork, (5*nmat))
    SAFE_ALLOCATE(egv_t, (nmat))
    ilow = 1
    iup = neig
#ifdef CPLX
    SAFE_ALLOCATE(rwork, (7*nmat))

    ! Note: we only do Hermiticity check in serial. parallel is complicated.
    non_herm_max = 0D0
    non_herm_sum = 0D0
    do jloop=1,nmat
      do iloop=1,jloop
        non_herm = abs(shs(iloop,jloop)-MYCONJG(shs(jloop,iloop)))
        non_herm_sum = non_herm_sum + non_herm
        if (non_herm .gt. non_herm_max) non_herm_max = non_herm     
        !shs(iloop,jloop) =  (shs(iloop,jloop)+MYCONJG(shs(jloop,iloop)))/2D0
        !shs(jloop,iloop) = MYCONJG(shs(iloop,jloop))
      enddo
    enddo
    if ( non_herm_max .gt. 1D-4 ) then
      write(0,*) ' '
      write(0,*) 'WARNING!! LARGE NON-HERMITICITY',non_herm_max,non_herm_sum
      write(0,*) ' '
    endif

#ifdef USEESSL
    SAFE_ALLOCATE(hupper, (nmat*(nmat+1)/2))
    hup_counter = 1
    do jloop=1,nmat
      do iloop=1,jloop
        hupper(hup_counter) =  shs(iloop,jloop)
        hup_counter = hup_counter + 1
      enddo
    enddo
    call zhpev(21,hupper,egv_t,egs,nmat,nmat,iwork,0)
    SAFE_DEALLOCATE(hupper)
#else
    call zheevx('V',range,'U',nmat,shs,nmat,ellow,elup,ilow,iup, &
      abstol,nfound,egv_t,egs,nmat,work,8*nmat,rwork,iwork,ifail,info)
    if(nfound .lt. neig) then
      write(tmpstr,'(a, i10, a, i10, a)') 'Diagonalization with zheevx failed: only ', &
        nfound, ' of ', neig, ' eigenvalues found.'
      call die(tmpstr)
    endif
#endif
    SAFE_DEALLOCATE(rwork)

#else

#ifdef USEESSL
    SAFE_ALLOCATE(hupper, (nmat*(nmat+1)/2))
    hup_counter = 1
    do jloop=1,nmat
      do iloop=1,jloop
        hupper(hup_counter) =  shs(iloop,jloop)
        hup_counter = hup_counter + 1
      enddo
    enddo
    call dspev(21,hupper,egv_t,egs,nmat,nmat,iwork,0)
    SAFE_DEALLOCATE(hupper)
#else
    call dsyevx('V',range,'U',nmat,shs,nmat,ellow,elup,ilow,iup, &
      abstol,nfound,egv_t,egs,nmat,work,10*nmat,iwork,ifail,info)
    if(nfound .lt. neig) then
      write(tmpstr,'(a, i10, a, i10, a)') 'Diagonalization with dsyevx failed: only ', &
        nfound, ' of ', neig, ' eigenvectors found.'
      call die(tmpstr)
    endif
#endif
#endif

    if(info.lt.0) then
      write(tmpstr,*) "Problem in input parameters for zheevx/dsyevx: info = ",info
      call die(tmpstr)
    endif

    if(info.gt.0) then
      write(0,*) "Convergence problems in zheevx/dsyevx: info = ",info
      write(tmpstr,*) 'The following eigenvector failed to converge: ifail = ',ifail
      call die(tmpstr)
    endif

    do ii=1,neig
      egv(ii) = egv_t(ii)
    enddo
    SAFE_DEALLOCATE(egv_t)
    SAFE_DEALLOCATE(iwork)
    SAFE_DEALLOCATE(work)
    SAFE_DEALLOCATE(ifail)
        
    POP_SUB(diagonalize)
    return
  endif

! The rest of this routine is only if the # of processors is not 1,
! which means we must be running MPI (the code below assumes it actually)

#if defined MPI && !defined USESCALAPACK
  #error ScaLAPACK is required for MPI builds.
#endif

#ifdef MPI

! Calculate num_g_max
! max number of g vecs per pe

  do i = 1,peinf%npes
    nrow_p(i) = peinf%nblockd*peinf%ibt(i)
  enddo

  idum = 0
  do i = 1,peinf%npes
    if(nrow_p(i).gt.idum) idum = nrow_p(i)
  enddo
  num_g_max = idum

! Calculate node index array for communications

  do ii = 1,peinf%npes
    idum = (ii-1)*peinf%nblockd
    do jj = 1,peinf%ibt(ii)
      do kk = 1,peinf%nblockd
        idum = idum + 1
        nod_g(idum) = ii-1
      enddo
      idum = idum + (peinf%npes-1)*peinf%nblockd
    enddo
  enddo

  do ii = 1,peinf%npes
    idum = (ii-1)*peinf%nblockd
    do jj = 1,peinf%ibt(ii)
      do kk = 1,peinf%nblockd
        idum = idum + 1
        icol(idum) = (jj-1)*peinf%nblockd + kk
      enddo
      idum = idum + (peinf%npes-1)*peinf%nblockd
    enddo
  enddo

  call mpi_comm_rank(mpi_comm_world,inode,mpierr)

! Choose scalapack layout. Block cyclic. Block size as
! close to 32 as possible for large matrices
! processor layout as close as possible to square

  call blacs_setup(scal, nmat, .false.)

! Figure out number of blocks per processor in the column/row
! and array sizes and maximums as well for allocations

  nbc = nmat/(scal%nbl*scal%npcol)
  if(mod(nmat,(scal%nbl*scal%npcol)).gt.scal%mypcol*scal%nbl) nbc=nbc+1
  nbr = nmat/(scal%nbl*scal%nprow)
  if(mod(nmat,(scal%nbl*scal%nprow)).gt.scal%myprow*scal%nbl) nbr=nbr+1
  call mpi_allreduce(nbc,nbc_max,1,mpi_integer,mpi_max, &
    mpi_comm_world,mpierr)
  call mpi_allreduce(nbr,nbr_max,1,mpi_integer,mpi_max, &
    mpi_comm_world,mpierr)
  numbl_max = nbc_max*nbr_max
  
  SAFE_ALLOCATE(egv_t, (nmat))
  SAFE_ALLOCATE(shs_bl, (numbl_max*scal%nbl*scal%nbl))
  shs_bl(:) = 0.0
  
  nbce = neig/(scal%nbl*scal%npcol)
  if(mod(neig,(scal%nbl*scal%npcol)).gt.scal%mypcol*scal%nbl) &
    nbce=nbce+1
  
  SAFE_ALLOCATE(egs_bl, (numbl_max*scal%nbl*scal%nbl))
  egs_bl(:) = 0.0
  
! Calculate work array sizes etc for pzheevx/pdsyevx
! assume no more than 20 eigenvalues in any one cluster
! if more pzheevx/pdsyevx will abort. If more then more memory is
! required for the orthogonalisation ie more work space
! amc  neig set to nv for work space calculation size
! seems to be a problem in pzheevx/pdsyevx with setting it to neig

  clustersize = 50
  dzero = 0.0
  izero = 0
  locsize = scal%nbl*scal%nbl*nbc*nbr
  nn = max(nmat,scal%nbl,2)
  nnp = max(nmat,scal%nprow*scal%npcol+1,4)
  np0 = numroc(nn,scal%nbl,0,0,scal%nprow)
  mq0 = numroc(max(neig,scal%nbl,2),scal%nbl,0,0,scal%npcol)

#ifdef CPLX
! Use nq0 for lwork to be consistent with pzheevx, man pages wrong

  nq0 = numroc(nn,scal%nbl,0,0,scal%npcol)
  lwork = nmat + (np0+nq0+scal%nbl)*scal%nbl
  
  lrwork=4*nmat+max(5*nn,np0*mq0)+iceil(neig,scal%nprow*scal%npcol)*nn + &
    (clustersize-1)*nmat

#else
  lwork=5*nmat+max(5*nn,np0*mq0+2*scal%nbl*scal%nbl)+ &
    iceil(neig,scal%nprow*scal%npcol)*nn+(clustersize-1)*nmat
  lrwork=0
#endif
  liwork = 6*nnp

! Calculate sizes for index arrays for mpi comms. : h matrix
! for s matrix, use the same num_max (requires little additional memory)

  SAFE_ALLOCATE(numr, (peinf%npes))

  numr = 0
  do ibc=0, nbc-1           ! loop over column blocks
    do ic=(ibc*scal%npcol+scal%mypcol)*scal%nbl+1, &
      min((ibc*scal%npcol+scal%mypcol)*scal%nbl+scal%nbl,nmat) ! loop over cols
      iproc_n = nod_g(ic)
      do ibr =0, nbr-1    ! loop over row blocks
        numr(iproc_n+1) = numr(iproc_n+1) + 1
      end do
    end do
  end do

! Get max of numr

  idum = 0
  do i = 1,peinf%npes
    if(numr(i).gt.idum) idum = numr(i)
  enddo

  call mpi_allreduce(idum,num_max,1,mpi_integer,mpi_max,mpi_comm_world,mpierr)

! Allocate index arrays for communications
! for remapping to scalapack layout

  SAFE_ALLOCATE(nums, (peinf%npes))
  SAFE_ALLOCATE(indr, (num_max,peinf%npes))
  SAFE_ALLOCATE(inds, (num_max,peinf%npes))
  SAFE_ALLOCATE(iaddr, (num_max,peinf%npes))
  SAFE_ALLOCATE(iadds_r, (num_max,peinf%npes))
  SAFE_ALLOCATE(iadds_c, (num_max,peinf%npes))
  SAFE_ALLOCATE(iadds_d, (num_max,peinf%npes))
  
  SAFE_ALLOCATE(numre, (peinf%npes))
  SAFE_ALLOCATE(numse, (peinf%npes))
  SAFE_ALLOCATE(indre, (num_max,peinf%npes))
  SAFE_ALLOCATE(indse, (num_max,peinf%npes))
  SAFE_ALLOCATE(iaddre, (num_max,peinf%npes))
  SAFE_ALLOCATE(iaddse_r, (num_max,peinf%npes))
  SAFE_ALLOCATE(iaddse_c, (num_max,peinf%npes))
  SAFE_ALLOCATE(iaddse_d, (num_max,peinf%npes))

! Allocate memory

  SAFE_ALLOCATE(work, (lwork))
#ifdef CPLX
  SAFE_ALLOCATE(rwork, (lrwork))
#endif
  SAFE_ALLOCATE(iwork, (liwork))
  
  SAFE_ALLOCATE(iclustr, (2*scal%nprow*scal%npcol))
  SAFE_ALLOCATE(gap, (scal%nprow*scal%npcol))
  SAFE_ALLOCATE(ifail, (nmat))
  
! Remap state distributed s and h to block-block blacs distribution
! in  zg and s
! calculate index matrices first for communications

  indr(:,:) = 0
  
  numr = 0
  
  idiff=0
  do ibc=0, nbc-1           ! loop over column blocks
    do ic=(ibc*scal%npcol+scal%mypcol)*scal%nbl+1, &
      min((ibc*scal%npcol+scal%mypcol)*scal%nbl+scal%nbl,nmat) ! loop over cols
      ngr = 0
      iproc_n = nod_g(ic)
      do ibr =0, nbr-1    ! loop over row blocks
        ir_low = (ibr*scal%nprow+scal%myprow)*scal%nbl+1
        ilen = min((ibr*scal%nprow+scal%myprow)*scal%nbl+scal%nbl,nmat)-ir_low+1
        ngr = ngr + ilen
        numr(iproc_n+1) = numr(iproc_n+1) + 1
        
        indr(numr(iproc_n+1),iproc_n+1) = ilen
        iaddr(numr(iproc_n+1),iproc_n+1) = idiff+1
        iadds_r(numr(iproc_n+1),iproc_n+1) = ir_low
        iadds_c(numr(iproc_n+1),iproc_n+1) = icol(ic)
        
        idiff = idiff + ilen
      end do
    end do
  end do

  indre(:,:) = 0
  
  numre = 0
  
  idiff=0
  do ibc=0, nbce-1           ! loop over column blocks
    do ic=(ibc*scal%npcol+scal%mypcol)*scal%nbl+1, &
      min((ibc*scal%npcol+scal%mypcol)*scal%nbl+scal%nbl,neig) ! loop over cols
      iproc_n = nod_g(ic)
      do ibr =0, nbr-1    ! loop over row blocks
        ir_low = (ibr*scal%nprow+scal%myprow)*scal%nbl+1
        ilen = min((ibr*scal%nprow+scal%myprow)*scal%nbl+scal%nbl,nmat)-ir_low+1
        numre(iproc_n+1) = numre(iproc_n+1) + 1
        
        indre(numre(iproc_n+1),iproc_n+1) = ilen
        iaddre(numre(iproc_n+1),iproc_n+1) = idiff+1
        iaddse_r(numre(iproc_n+1),iproc_n+1) = ir_low
        iaddse_c(numre(iproc_n+1),iproc_n+1) = icol(ic)
        
        idiff = idiff + ilen
      end do
    end do
  end do
  
! Communicate sending indexes etc to sending processors

  do i = 1,peinf%npes
    call mpi_isend(numr(i),1,mpi_integer,i-1,inode, &
      mpi_comm_world,icom,mpierr)
  enddo
  do i = 1,peinf%npes
    call mpi_recv(nums(i),1,mpi_integer,i-1,i-1, &
      mpi_comm_world,mpistatus,mpierr)
  enddo
  call mpi_barrier(mpi_comm_world,mpierr)
  do i = 1,peinf%npes
    call mpi_isend(indr(1,i),numr(i),mpi_integer,i-1,inode, &
        mpi_comm_world,icom,mpierr)
  enddo
  do i = 1,peinf%npes
    call mpi_recv(inds(1,i),nums(i),mpi_integer,i-1,i-1, &
      mpi_comm_world,mpistatus,mpierr)
  enddo
  call mpi_barrier(mpi_comm_world,mpierr)
  do i = 1,peinf%npes
    call mpi_isend(iadds_r(1,i),numr(i),mpi_integer,i-1,inode, &
      mpi_comm_world,icom,mpierr)
  enddo
  do i = 1,peinf%npes
    call mpi_recv(iadds_d(1,i),nums(i),mpi_integer,i-1,i-1, &
      mpi_comm_world,mpistatus,mpierr)
  enddo
  call mpi_barrier(mpi_comm_world,mpierr)
  do i = 1,peinf%npes
    call mpi_isend(iadds_c(1,i),numr(i),mpi_integer,i-1,inode, &
      mpi_comm_world,icom,mpierr)
  enddo
  do i = 1,peinf%npes
    call mpi_recv(iadds_r(1,i),nums(i),mpi_integer,i-1,i-1, &
      mpi_comm_world,mpistatus,mpierr)
  enddo
  do i = 1,peinf%npes
    call mpi_isend(numre(i),1,mpi_integer,i-1,inode, &
      mpi_comm_world,icom,mpierr)
  enddo
  do i = 1,peinf%npes
    call mpi_recv(numse(i),1,mpi_integer,i-1,i-1, &
      mpi_comm_world,mpistatus,mpierr)
  enddo
  call mpi_barrier(mpi_comm_world,mpierr)
  do i = 1,peinf%npes
    call mpi_isend(indre(1,i),numre(i),mpi_integer,i-1,inode, &
      mpi_comm_world,icom,mpierr)
  enddo
  do i = 1,peinf%npes
    call mpi_recv(indse(1,i),numse(i),mpi_integer,i-1,i-1, &
      mpi_comm_world,mpistatus,mpierr)
  enddo
  call mpi_barrier(mpi_comm_world,mpierr)
  do i = 1,peinf%npes
    call mpi_isend(iaddse_r(1,i),numre(i),mpi_integer,i-1,inode, &
      mpi_comm_world,icom,mpierr)
  enddo
  do i = 1,peinf%npes
    call mpi_recv(iaddse_d(1,i),numse(i),mpi_integer,i-1,i-1, &
      mpi_comm_world,mpistatus,mpierr)
  enddo
  call mpi_barrier(mpi_comm_world,mpierr)
  do i = 1,peinf%npes
    call mpi_isend(iaddse_c(1,i),numre(i),mpi_integer,i-1,inode, &
      mpi_comm_world,icom,mpierr)
  enddo
  do i = 1,peinf%npes
    call mpi_recv(iaddse_r(1,i),numse(i),mpi_integer,i-1,i-1, &
      mpi_comm_world,mpistatus,mpierr)
  enddo

! Allocate shmem dummy array hardwire to block size of 32 used in BLACS layout

!     call shpalloc(dum_p,32*peinf%npes,ierr,-1)

! Now communicate the data

  do j = 1,num_max
    
    call mpi_barrier(mpi_comm_world,mpierr)
    
    do i = 1,peinf%npes
      if(j.le.nums(i)) then
        call mpi_isend(shs(iadds_d(j,i),iadds_r(j,i)),inds(j,i), &
          MPI_SCALAR,i-1,nums(i)+j,mpi_comm_world,icoma(i),mpierr)
!          ix = iadds_d(j,i)-1
!          iy = iadds_r(j,i)
!          do ii = 1,inds(j,i)
!            dum(ii,i) = sis(ix+ii,iy)
!          enddo

      endif
    enddo

!        write(6,*) 'line 606', j, num_max

    call mpi_barrier(mpi_comm_world,mpierr)
    
    do i = 1,peinf%npes
      if(j.le.numr(i)) then
        call mpi_recv(shs_bl(iaddr(j,i)),indr(j,i), &
          MPI_SCALAR,i-1,numr(i)+j,mpi_comm_world,mpistatus,mpierr)
!          call shmem_get(zg(iaddr(j,i)),dum(1,inode+1),indr(j,i),i-1)
      endif
    enddo

    do i = 1,peinf%npes
      if(j.le.nums(i)) then
        call MPI_WAIT(icoma(i),mpistatus,mpierr)
      endif
    enddo
    
  enddo

  call mpi_barrier(mpi_comm_world,mpierr)

  call descinit(desca,nmat,nmat,scal%nbl,scal%nbl,0,0,scal%icntxt,ngr,info)
  
  abstol =  0.0
  orfac= 5d-7
  ilow=1
  iup=neig

  if (peinf%inode.eq.0) write(6,*) 'Beginning ScaLAPACK Diagonalization. Size: ', nmat
!  if (peinf%inode.eq.0) write(6,*) "range", range
!  if (peinf%inode.eq.0) write(6,*) "nmat", nmat
!  if (peinf%inode.eq.0) write(6,*) "shs_bl", shs_bl
!  if (peinf%inode.eq.0) write(6,*) "ellow", ellow
!  if (peinf%inode.eq.0) write(6,*) "ellup", elup
!  if (peinf%inode.eq.0) write(6,*) "ilow", ilow
!  if (peinf%inode.eq.0) write(6,*) "iup", iup
!  if (peinf%inode.eq.0) write(6,*) "abstol", abstol

#ifdef CPLX
  call pzheevx &
#else
  call pdsyevx &
#endif
    ('V',range,'U',nmat,shs_bl,1,1,desca, &
    ellow,elup,ilow,iup,abstol,nfound, &
    nzfound,egv_t,orfac,egs_bl,1,1,desca,work,lwork, &
#ifdef CPLX
    rwork,lrwork, &
#endif 
    iwork,liwork,ifail,iclustr,gap,info)

  if (peinf%inode.eq.0) write(6,*) 'Done Scalapack Diagonalization'
  if(info.lt.0) call die(" error in parameters for pzheevx/pdsyevx")
  
  if(info.gt.0) then
    write(0,*) " info = ",info
    if(inode.eq.0) then
      write(0,*) " iclustr = ",iclustr
      write(0,*) " gap = ",gap
    endif
    call die("Convergence or memory problems in pzheevx/pdsyevx.")
  endif

  if(nfound .lt. neig) then
    write(tmpstr,'(a, i10, a, i10, a)') 'Diagonalization with pzheevx/pdsyevx failed: only ', &
      nfound, ' of ', neig, ' eigenvalues found.'
    call die(tmpstr)
  endif

  if(nzfound .lt. neig) then
    write(tmpstr,'(a, i10, a, i10, a)') 'Diagonalization with pzheevx/pdsyevx failed: only ', &
      nzfound, ' of ', neig, ' eigenvectors found.'
    call die(tmpstr)
  endif

! Copy eigenvector layout from scalapack layout back to program layout

  if (peinf%inode.eq.0) write(6,*) 'Sharing eigenvectors'

  do j = 1,num_max

    call mpi_barrier(mpi_comm_world,mpierr)
    
    do i = 1,peinf%npes
      if(j.le.numre(i)) then
        call mpi_isend(egs_bl(iaddre(j,i)),indre(j,i), &
          MPI_SCALAR,i-1,numre(i)+j,mpi_comm_world,icoma(i),mpierr)
!          ix = iaddre(j,i)-1
!          do ii = 1,indre(j,i)
!            dum(ii,i) = shs(ix+ii)
!          enddo

      endif
    enddo
        
    call mpi_barrier(mpi_comm_world,mpierr)
    
    do i = 1,peinf%npes
      if(j.le.numse(i)) then
        call mpi_recv(egs(iaddse_d(j,i),iaddse_r(j,i)),indse(j,i), &
          MPI_SCALAR,i-1,numse(i)+j,mpi_comm_world,mpistatus,mpierr)
!          call shmem_get(zg(iaddse_d(j,i),iaddse_r(j,i)),dum(1,inode+1),
!     >                   indse(j,i),i-1)
      endif
    enddo

    do i = 1,peinf%npes
      if(j.le.numre(i)) then
        call MPI_WAIT(icoma(i),mpistatus,mpierr)
      endif
    enddo
    
  enddo
      
  do ii=1,neig
    egv(ii) = egv_t(ii)
  enddo
  
  call mpi_barrier(mpi_comm_world,mpierr)
      
  call BLACS_GRIDEXIT(scal%icntxt)
  
  SAFE_DEALLOCATE(work)
#ifdef CPLX
  SAFE_DEALLOCATE(rwork)
#endif
  SAFE_DEALLOCATE(iwork)
  SAFE_DEALLOCATE(iclustr)
  SAFE_DEALLOCATE(gap)
  SAFE_DEALLOCATE(ifail)
  SAFE_DEALLOCATE(numr)
  SAFE_DEALLOCATE(nums)
  SAFE_DEALLOCATE(indr)
  SAFE_DEALLOCATE(inds)
  SAFE_DEALLOCATE(iaddr)
  SAFE_DEALLOCATE(iadds_r)
  SAFE_DEALLOCATE(iadds_c)
  SAFE_DEALLOCATE(iadds_d)
  SAFE_DEALLOCATE(numre)
  SAFE_DEALLOCATE(numse)
  SAFE_DEALLOCATE(indre)
  SAFE_DEALLOCATE(indse)
  SAFE_DEALLOCATE(iaddre)
  SAFE_DEALLOCATE(iaddse_r)
  SAFE_DEALLOCATE(iaddse_c)
  SAFE_DEALLOCATE(iaddse_d)
  SAFE_DEALLOCATE(egs_bl)
  SAFE_DEALLOCATE(shs_bl)
  SAFE_DEALLOCATE(egv_t)
  
  if (peinf%inode.eq.0) write(6,*) 'Diagonalization done'

!      call shpdeallc(dum_p,ierr,-1)

#endif

  POP_SUB(diagonalize)

  return
end subroutine diagonalize
