! @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ !
! @@                                                                @@ !
! @@       PROGRAM  ASCOT 2014.440 (ver.4.40)                       @@ !
! @@     "Abinitio Simulation Code for Quantum Transport"           @@ !
! @@                                                                @@ !
! @@                                                                @@ !
! @@  AUTHOR(S): Naoki WATANABE, Nobutaka NISHIKAWA (Mizuho I.R.)   @@ !
! @@             Hisashi KONDO (Univ. Tokyo)                        @@ !
! @@                                                09/May/2014     @@ !
! @@                                                                @@ !
! @@  Contact address: Phase System Consortium                      @@ !
! @@                                                                @@ !
! @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ !

module ac_mpi_module
  use ac_misc_module
  implicit none
  include 'mpif.h'

  type MPI_type
     integer :: size
     integer :: rank
     integer :: info
     logical :: root
     !!     integer :: DOUBLE_POSITION
     !!     integer :: SUM_POSITION

     integer :: npao
     integer :: isatom
     integer :: ieatom

     integer,pointer :: ispao(:)
     integer,pointer :: iepao(:)
     integer,pointer :: vnpao(:)

     integer,pointer :: vsmat(:)
     integer,pointer :: vemat(:)
     integer,pointer :: vnmat(:)

     integer         ::  smat
     integer         ::  emat
     integer         ::  nmat

     integer,pointer :: vmatcount(:)
     integer,pointer :: vmatdispl(:)

     integer :: SL_ID
     integer :: SL_DESC(9)
     integer :: SL_DESCLS(9)

     integer :: localsize1
     integer :: localsize2

     real(8) :: num_sc_work2space = -1.d0
     integer :: num_check_sc = 10000000
  end type MPI_type

  !!  type(MPI_type), public :: MPI
  type(MPI_type), public, save :: MPI

contains
  !!  subroutine MPI__Sum_Position( vin, vinout, size, dummy )
  !!    implicit none
  !!    integer, intent(in) :: size, dummy
  !!    real(8), intent(in) :: vin(size,3)
  !!    real(8), intent(inout) :: vinout(size,3)
  !!    integer :: i
  !!
  !!    do i=1, size
  !!       vinout(i,:) = vinout(i,:) + vin(i,:)
  !!    end do
  !!
  !!    return
  !!  end subroutine MPI__Sum_Position
  subroutine MPI__Initialize
    implicit none

    call MPI_Init( MPI%info )
    call MPI_Comm_rank( MPI_COMM_WORLD, MPI%rank, MPI%info )
    call MPI_Comm_size( MPI_COMM_WORLD, MPI%size, MPI%info )

    if( MPI%rank == 0 ) then
       MPI%root = .true.
    else
       MPI%root = .false.
    end if

    !!    call MPI_Type_Contiguous( &
    !!         3, MPI_DOUBLE_PRECISION, MPI%DOUBLE_POSITION, MPI%info )
    !!    call MPI_Type_Commit( MPI%DOUBLE_POSITION, MPI%info )
    !!    call MPI_Op_create( MPI__Sum_Position, .true., MPI%SUM_POSITION, MPI%info )

    call SL_Init( MPI%SL_ID, 1, MPI%size )

    return
  end subroutine MPI__Initialize

  subroutine MPI__Finalize
    implicit none

    call BLACS_GRIDEXIT( MPI%SL_ID )

    !!    call MPI_Op_Free  ( MPI%SUM_POSITION, MPI%info )
    !!    call MPI_Type_Free( MPI%DOUBLE_POSITION, MPI%info )
    call MPI_Finalize ( MPI%info )

    if( associated(MPI%vnmat) ) then
       deallocate( MPI%vnmat, MPI%vsmat, MPI%vemat )
    end if

    if( associated(MPI%ispao) ) then
       deallocate(MPI%ispao,MPI%iepao,MPI%vnpao)
    end if

    return
  end subroutine MPI__Finalize

  subroutine MPI__setup
    implicit none
    integer :: p, a, i, n

    if( Param%Data%npao < MPI%size ) then
       open(unit=16,file=Param%Option%file_ac_tempout,position='append')
       write(16,*) "      ++++++ warning! : too many processors for MPI"
       close(16)
    end if

    if( associated(MPI%vmatcount) ) then
       deallocate( MPI%vmatcount, MPI%vmatdispl )
    end if
    allocate( MPI%vmatcount(0:MPI%size-1), MPI%vmatdispl(0:MPI%size-1) )

    if( associated(MPI%vnmat) ) then
       deallocate( MPI%vnmat, MPI%vsmat, MPI%vemat )
    end if
    allocate( MPI%vnmat(0:MPI%size-1), MPI%vsmat(0:MPI%size-1), MPI%vemat(0:MPI%size-1) )

    if( associated(MPI%vnpao) ) then
       deallocate(MPI%vnpao, MPI%ispao, MPI%iepao)
    end if
    allocate( MPI%vnpao(0:MPI%size-1), MPI%ispao(Param%Data%natom), MPI%iepao(Param%Data%natom) )

    MPI%localsize1 = int(ceiling(dble(Param%Data%npao) / dble(MPI%size)))
    MPI%localsize2 = 2*int(ceiling(dble(Param%Data%npao) / dble(MPI%size)))

    if( MPI%localsize1*(MPI%size-1) >= Param%Data%npao ) then
       open(unit=16,file=Param%Option%file_ac_tempout,position='append')
       write(16,999)  MPI%size
999    format("      ++++++ Sorry! : This program can not be parallelized by processors of ",i5)
       close(16)
       call MPI__Finalize
       stop
    end if

    call DescInit( MPI%SL_DESC, Param%Data%npao, Param%Data%npao, &
         MPI%localsize1, MPI%localsize1, 0, 0, MPI%SL_ID, Param%Data%npao, MPI%info )
    call DescInit( MPI%SL_DESCLS, 2*Param%Data%npao, 2*Param%Data%npao, &
         MPI%localsize2, MPI%localsize2, 0, 0, MPI%SL_ID, 2*Param%Data%npao, MPI%info )

    n=0
    do p=0, MPI%size-1
       if( n + MPI%localsize1 > Param%Data%npao ) then
          MPI%vnpao(p) = Param%Data%npao - n
       else
          MPI%vnpao(p) = MPI%localsize1
       end if
       n = n + MPI%vnpao(p)
    end do
    MPI%npao = MPI%vnpao(MPI%rank)

    i=0
    do p=0, MPI%size-1
       MPI%vsmat(p) = i+1
       MPI%vemat(p) = MPI%vsmat(p) + MPI%vnpao(p) - 1
       MPI%vnmat(p) = MPI%vnpao(p)
       i = i + MPI%vnpao(p)
    end do
    MPI%smat = MPI%vsmat(MPI%rank)
    MPI%emat = MPI%vemat(MPI%rank)
    MPI%nmat = MPI%vnmat(MPI%rank)

    do p=0, MPI%size-1
       MPI%vmatcount(p) = MPI%vnmat(p) * MPI%nmat
       !!       MPI%vmatdispl(p) = MPI%vsmat(p) * MPI%nmat
       MPI%vmatdispl(p) = MPI%nmat * MPI%vsmat(p) - MPI%nmat * MPI%vsmat(0)
    end do

    do a=1, Param%Data%natom
       MPI%ispao(a) = 0+1
       MPI%iepao(a) = Param%Data%vnpao(a)-1+1
    end do

    n=0
    do a=1, Param%Data%natom
       n=n+Param%Data%vnpao(a)
       if( MPI%vsmat(MPI%rank)-1 < n ) then
          MPI%isatom = a
          MPI%ispao(a) = (MPI%vsmat(MPI%rank)-1) - (Param%Data%vipao(a)-1) + 1
          exit
       end if
    end do

    n=0
    do a=1, Param%Data%natom
       n=n+Param%Data%vnpao(a)
       if( MPI%vemat(MPI%rank)-1 < n ) then
          MPI%ieatom = a
          MPI%iepao(a) = (MPI%vemat(MPI%rank)-1) - (Param%Data%vipao(a)-1) + 1
          exit
       end if
    end do

    if( MPI%num_sc_work2space >= -1.d0 ) then
       if( Param%Option%spin_orbit ) then
          p=(2*Base%npao/sqrt(dfloat(1*MPI%size))-1.d0)*2*Base%npao
       else
          p=(Base%npao/sqrt(dfloat(1*MPI%size))-1.d0)*Base%npao
       end if
       if( MPI%num_sc_work2space > 0.d0 ) then
          if( MPI%num_sc_work2space > dfloat(p) ) then
             MPI%num_sc_work2space = dfloat(p)
          end if
       else
          MPI%num_sc_work2space = -dfloat(p) * MPI%num_sc_work2space
          if( MPI%num_sc_work2space > dfloat(p) ) then
             MPI%num_sc_work2space = dfloat(p)
          end if
       end if
    else
       MPI%num_sc_work2space=0.d0
    end if
    return
  end subroutine MPI__setup

  subroutine MPI__Bcast_Inputfile( filename )
    implicit none
    character(64), intent(inout) :: filename

    if( MPI%size == 1 ) return

    call MPI_Bcast( filename, 64, MPI_CHARACTER, 0, MPI_COMM_WORLD, MPI%info )

    return
  end subroutine MPI__Bcast_Inputfile

  subroutine MPI__Allreduce_DensityLS( densityLS )
    implicit none
    complex(8), intent(inout) :: densityLS(Param%Option%nspin,Param%Cell%Na,Param%Cell%Nb,Param%Cell%Nc)
    complex(8), allocatable :: densityLS_mpi(:,:,:,:) 

    if( MPI%size == 1 ) return

    allocate(densityLS_mpi(Param%Option%nspin,Param%Cell%Na,Param%Cell%Nb,Param%Cell%Nc))
    densityLS_mpi = densityLS

    call MPI_Allreduce( &
         densityLS_mpi, densityLS, &
         Param%Option%nspin * Param%Cell%Na * Param%Cell%Nb * Param%Cell%Nc, &
         MPI_DOUBLE_COMPLEX, &
         MPI_SUM, &
         MPI_COMM_WORLD, MPI%info )

    deallocate( densityLS_mpi )

    return
  end subroutine MPI__Allreduce_DensityLS

  subroutine MPI__Allreduce_Density( density )
    implicit none
    real(8), intent(inout) :: density (Param%Option%nspin,Param%Cell%Na,Param%Cell%Nb,Param%Cell%Nc)
    real(8), allocatable :: density_mpi(:,:,:,:) 

    if( MPI%size == 1 ) return

    allocate(density_mpi(Param%Option%nspin,Param%Cell%Na,Param%Cell%Nb,Param%Cell%Nc))
    density_mpi = density

    call MPI_Allreduce( &
         density_mpi, density, &
         Param%Option%nspin * Param%Cell%Na * Param%Cell%Nb * Param%Cell%Nc, &
         MPI_DOUBLE_PRECISION, &
         MPI_SUM, &
         MPI_COMM_WORLD, MPI%info )

    deallocate( density_mpi )

    return
  end subroutine MPI__Allreduce_Density

  subroutine MPI__AlltoAll_Matrix( matrix )
    implicit none
    complex(8), intent(inout) :: matrix(Param%Data%npao,MPI%smat:MPI%emat)
    complex(8), allocatable :: pack_send(:)
    complex(8), allocatable :: pack_recv(:)
    integer :: i, j, p, index

    if( MPI%size == 1 ) then
       do i=1, Param%Data%npao
          do j=i, Param%Data%npao
             matrix(i,j) = dconjg(matrix(i,j)) + matrix(j,i)
             matrix(j,i) = dconjg(matrix(i,j))
          end do
       end do

       return
    end if

    allocate(pack_send(MPI%nmat*Param%Data%npao))
    allocate(pack_recv(MPI%nmat*Param%Data%npao))

    index=0
    do p=0, MPI%size-1
       do i=MPI%smat, MPI%emat
          do j=MPI%vsmat(p), MPI%vemat(p)
             index=index+1
             pack_send(index) = matrix(j,i)
          end do
       end do
    end do

    call MPI_AlltoAllv( &
         pack_send, MPI%vmatcount, MPI%vmatdispl, MPI_DOUBLE_COMPLEX, &
         pack_recv, MPI%vmatcount, MPI%vmatdispl, MPI_DOUBLE_COMPLEX, &
         MPI_COMM_WORLD, MPI%info )

    index=0
    do p=0, MPI%size-1
       do j=MPI%vsmat(p), MPI%vemat(p)
          do i=MPI%smat, MPI%emat
             index=index+1
             matrix(j,i) = dconjg(matrix(j,i)) + pack_recv(index)
          end do
       end do
    end do

    deallocate( pack_send, pack_recv )

    return
  end subroutine MPI__AlltoAll_Matrix

  subroutine MPI__AlltoAll_Matrix2( matrix2, matrix3 )
    implicit none
    complex(8), intent(inout) :: matrix2(Param%Data%npao,MPI%smat:MPI%emat)
    complex(8), intent(inout) :: matrix3(Param%Data%npao,MPI%smat:MPI%emat)
    complex(8), allocatable :: pack_send(:)
    complex(8), allocatable :: pack_recv(:)
    integer :: i, j, p, index

    if( MPI%size == 1 ) then
       do i=1, Param%Data%npao
          do j=1, Param%Data%npao
             matrix2(i,j) = dconjg(matrix2(i,j)) + matrix3(j,i)
             matrix3(j,i) = dconjg(matrix2(i,j))
          end do
       end do

       return
    end if

    allocate(pack_send(MPI%nmat*Param%Data%npao))
    allocate(pack_recv(MPI%nmat*Param%Data%npao))

    index=0
    do p=0, MPI%size-1
       do i=MPI%smat, MPI%emat
          do j=MPI%vsmat(p), MPI%vemat(p)
             index=index+1
             pack_send(index) = matrix3(j,i)
          end do
       end do
    end do

    call MPI_AlltoAllv( &
         pack_send, MPI%vmatcount, MPI%vmatdispl, MPI_DOUBLE_COMPLEX, &
         pack_recv, MPI%vmatcount, MPI%vmatdispl, MPI_DOUBLE_COMPLEX, &
         MPI_COMM_WORLD, MPI%info )

    index=0
    do p=0, MPI%size-1
       do i=MPI%smat, MPI%emat
          do j=MPI%vsmat(p), MPI%vemat(p)
             index=index+1
             pack_send(index) = matrix2(j,i)
          end do
       end do
    end do

    index=0
    do p=0, MPI%size-1
       do j=MPI%vsmat(p), MPI%vemat(p)
          do i=MPI%smat, MPI%emat
             index=index+1
             matrix2(j,i) = dconjg(matrix2(j,i)) + pack_recv(index)
          end do
       end do
    end do

    call MPI_AlltoAllv( &
         pack_send, MPI%vmatcount, MPI%vmatdispl, MPI_DOUBLE_COMPLEX, &
         pack_recv, MPI%vmatcount, MPI%vmatdispl, MPI_DOUBLE_COMPLEX, &
         MPI_COMM_WORLD, MPI%info )


    index=0
    do p=0, MPI%size-1
       do j=MPI%vsmat(p), MPI%vemat(p)
          do i=MPI%smat, MPI%emat
             index=index+1
             matrix3(j,i) = dconjg(matrix3(j,i)) + pack_recv(index)
          end do
       end do
    end do

    deallocate( pack_send, pack_recv )

    return
  end subroutine MPI__AlltoAll_Matrix2

  subroutine MPI__Allreduce_Force( vatom )
    implicit none
    type(Atom_type), intent(inout) :: vatom(Param%Data%natom)
    real(8), allocatable :: force_mpi(:,:), force_sum(:,:)
    integer :: a

    if( MPI%size == 1 ) return

    allocate( force_mpi(Param%Data%natom,3), force_sum(Param%Data%natom,3) )
    do a=1, Param%Data%natom
       force_mpi(a,:) = vatom(a)%force
    end do

    call MPI_Allreduce( &
         force_mpi, force_sum, 3*Param%Data%natom, MPI_DOUBLE_PRECISION, &
         MPI_SUM, MPI_COMM_WORLD, MPI%info )

    do a=1, Param%Data%natom
       vatom(a)%force = force_sum(a,:)
    end do

    deallocate( force_mpi, force_sum )

    return
  end subroutine MPI__Allreduce_Force

  subroutine MPI__Allreduce_HistoryPulay( A )
    implicit none
    real(8), intent(inout) :: A(Param%SCF%mix_history,Param%SCF%mix_history)
    real(8), allocatable :: A_mpi(:,:) 

    if( MPI%size == 1 ) return

    allocate( A_mpi(Param%SCF%mix_history,Param%SCF%mix_history) )
    A_mpi = A

    call MPI_Allreduce( &
         A_mpi, A, Param%SCF%mix_history * Param%SCF%mix_history, &
         MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, MPI%info )

    deallocate( A_mpi )

    return
  end subroutine MPI__Allreduce_HistoryPulay

  subroutine MPI__Allreduce_HistoryAnderson( A )
    implicit none
    real(8), intent(inout) :: A(2:3,3)
    real(8) :: A_mpi(2:3,3) 

    if( MPI%size == 1 ) return

    A_mpi = A

    call MPI_Allreduce( &
         A_mpi, A, 2*3, &
         MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, MPI%info )

    return
  end subroutine MPI__Allreduce_HistoryAnderson

  subroutine MPI__Allreduce_Error( dEden )
    implicit none
    real(8), intent(inout) :: dEden
    real(8) :: dEden_mpi

    if( MPI%size == 1 ) return

    dEden_mpi = dEden

    call MPI_Allreduce( &
         dEden_mpi, dEden, 1, &
         MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, MPI%info )

    return
  end subroutine MPI__Allreduce_Error

  subroutine MPI__Allreduce_Error_Max( dEden )
    implicit none
    real(8), intent(inout) :: dEden
    real(8) :: dEden_mpi

    if( MPI%size == 1 ) return

    dEden_mpi = dEden

    call MPI_Allreduce( &
         dEden_mpi, dEden, 1, &
         MPI_DOUBLE_PRECISION, MPI_MAX, MPI_COMM_WORLD, MPI%info )

    return
  end subroutine MPI__Allreduce_Error_Max

  subroutine MPI__Barrier
    implicit none

    if( MPI%size == 1 ) return
    call MPI_Barrier(MPI_COMM_WORLD, MPI%info)

    return
  end subroutine MPI__Barrier

  subroutine MPI__ZHEGV( size, matrixH, matrixS, vectorE, iter )
    implicit none

    integer, intent(in)       :: size
    complex(8), intent(inout) :: matrixH(size,MPI%smat:MPI%emat)
    complex(8), intent(in)    :: matrixS(size,MPI%smat:MPI%emat)
    real(8),    intent(out)   :: vectorE(size)

    integer, intent(in) :: iter

    real(8) :: VL, VU
    integer :: IL, IU
    integer :: M,  NZ
    real(8), parameter :: ABSTOL = -1.0d0
    real(8), parameter :: ORFAC  = -1.0d0
    integer, allocatable :: icluster(:)
    real(8), allocatable :: gap(:)
    integer, allocatable :: ifail(:)

    complex(8), allocatable :: tempH(:,:)
    complex(8), allocatable :: tempS(:,:)
    complex(8), allocatable :: tempQ(:,:)
    complex(8), allocatable :: work1(:)
    real(8),    allocatable :: work2(:)
    integer,    allocatable :: work3(:)
    integer                 :: lwork1
    integer                 :: lwork2
    integer                 :: lwork3

    integer :: i, j

    allocate( tempH(size,MPI%smat:MPI%smat+MPI%localsize1-1) )
    allocate( tempS(size,MPI%smat:MPI%smat+MPI%localsize1-1) )
    allocate( tempQ(size,MPI%smat:MPI%smat+MPI%localsize1-1) )
    tempH(:,:)=0.d0
    tempS(:,:)=0.d0
    tempQ(:,:)=0.d0
    do i=MPI%smat, MPI%emat
       do j=1, size
          tempH(j,i) = matrixH(j,i)
          tempS(j,i) = matrixS(j,i)
       end do
    end do

    allocate( icluster(2*MPI%size), gap(MPI%size), ifail(size) )

    allocate( work1(1), work2(3), work3(1) )
    lwork1 = -1
    lwork2 = -1
    lwork3 = -1

    call PZHEGVX( 1, 'V', 'A', 'L', size, &
         tempH, 1, 1, MPI%SL_DESC, &
         tempS, 1, 1, MPI%SL_DESC, &
         VL, VU, IL, IU, ABSTOL, M, NZ, &
         vectorE, ORFAC, &
         tempQ, 1, 1, MPI%SL_DESC, &
         work1, lwork1, work2, lwork2, work3, lwork3, &
         ifail, icluster, gap, MPI%info )

    lwork1 = int(work1(1)) 
    lwork2 = int(work2(1)) + int(MPI%num_sc_work2space)
    lwork3 = int(work3(1)) 

    deallocate( work1, work2, work3 )
    allocate( work1(lwork1), work2(lwork2), work3(lwork3) )

    call PZHEGVX( 1, 'V', 'A', 'L', size, &
         tempH, 1, 1, MPI%SL_DESC, &
         tempS, 1, 1, MPI%SL_DESC, &
         VL, VU, IL, IU, ABSTOL, M, NZ, &
         vectorE, ORFAC, &
         tempQ, 1, 1, MPI%SL_DESC, &
         work1, lwork1, work2, lwork2, work3, lwork3, &
         ifail, icluster, gap, MPI%info )

    do i=MPI%smat, MPI%emat
       do j=1, size
          matrixH(j,i)=tempQ(j,i)
       end do
    end do

    deallocate( work1, work2, work3 )
    deallocate( icluster, gap, ifail )

    if( mod(iter,MPI%num_check_sc) == 0 .or. iter == 1 ) then
       if( iter /= 0 ) then
          open(unit=16,file=Param%Option%file_ac_tempout,position='append')
          write(16,*) '                               +++++ check: ScaLapack(PZHEGVX)'
          close(16)
       end if
       allocate( work1(2) )
       work1(1)=dcmplx(1.0d0,0.d0)
       work1(2)=dcmplx(0.0d0,0.d0)

       do i=MPI%smat, MPI%emat
          do j=1, size
             tempH(j,i)=matrixS(j,i)
          end do
       end do
       call MPI__ZGEMM('C', 'N', size, work1(1), tempQ, tempH, work1(2), tempS )
       call MPI__ZGEMM('N', 'N', size, work1(1), tempS, tempQ, work1(2), tempH )
       do i=MPI%smat, MPI%emat
          do j=1, size
             if( i == j ) then
                if( cdabs(tempH(j,i)-1.d0) > 1.d-10 ) then
                   open(unit=16,file=Param%Option%file_ac_tempout,position='append')
                   write(16,999) i,j,tempH(j,i)
                   write(16,998)
                   close(16)
                   stop
                end if
             else
                if( cdabs(tempH(j,i)) > 1.d-10 ) then
                   open(unit=16,file=Param%Option%file_ac_tempout,position='append')
                   write(16,999) i,j,tempH(j,i)
                   write(16,998)
                   close(16)
                   stop
                end if
             end if
          end do
       end do
999    format('error: MPI__ZHEGV',2i5,2e20.10,'  -> stop')
998    format('       ====> increase: //mpi_condition/num_sc_work2space//')

       deallocate( work1 )
    end if

    deallocate( tempH, tempS, tempQ )

    return
  end subroutine MPI__ZHEGV

  subroutine MPI__ZHEGVLS( size, matrixH, matrixS, vectorE, iter )
    implicit none

    integer, intent(in)       :: size
    complex(8), intent(inout) :: matrixH(2*size,2*MPI%smat-1:2*MPI%emat-0)
    complex(8), intent(in)    :: matrixS(2*size,2*MPI%smat-1:2*MPI%emat-0)
    real(8),    intent(out)   :: vectorE(2*size)

    integer, intent(in) :: iter

    real(8) :: VL, VU
    integer :: IL, IU
    integer :: M,  NZ
    real(8), parameter :: ABSTOL = -1.0d0
    real(8), parameter :: ORFAC  = -1.0d0
    integer, allocatable :: icluster(:)
    real(8), allocatable :: gap(:)
    integer, allocatable :: ifail(:)

    complex(8), allocatable :: tempH(:,:)
    complex(8), allocatable :: tempS(:,:)
    complex(8), allocatable :: tempQ(:,:)
    complex(8), allocatable :: work1(:)
    real(8),    allocatable :: work2(:)
    integer,    allocatable :: work3(:)
    integer                 :: lwork1
    integer                 :: lwork2
    integer                 :: lwork3

    integer :: i,j

    allocate( tempH(2*size,2*MPI%smat-1:2*MPI%smat-1+MPI%localsize2-1) )
    allocate( tempS(2*size,2*MPI%smat-1:2*MPI%smat-1+MPI%localsize2-1) )
    allocate( tempQ(2*size,2*MPI%smat-1:2*MPI%smat-1+MPI%localsize2-1) )
    tempH(:,:)=0.d0
    tempS(:,:)=0.d0
    tempQ(:,:)=0.d0
    do i=2*MPI%smat-1,2*MPI%emat-0
       do j=1, 2*size
          tempH(j,i) = matrixH(j,i)
          tempS(j,i) = matrixS(j,i)
       end do
    end do

    allocate( icluster(2*MPI%size), gap(MPI%size), ifail(2*size) )

    allocate( work1(1), work2(3), work3(1) )
    lwork1 = -1
    lwork2 = -1
    lwork3 = -1

    call PZHEGVX( 1, 'V', 'A', 'L', 2*size, &
         tempH, 1, 1, MPI%SL_DESCLS, &
         tempS, 1, 1, MPI%SL_DESCLS, &
         VL, VU, IL, IU, ABSTOL, M, NZ, &
         vectorE, ORFAC, &
         tempQ, 1, 1, MPI%SL_DESCLS, &
         work1, lwork1, work2, lwork2, work3, lwork3, &
         ifail, icluster, gap, MPI%info )

    lwork1 = int(work1(1)) 
    lwork2 = int(work2(1)) + int(MPI%num_sc_work2space)
    lwork3 = int(work3(1)) 

    deallocate( work1, work2, work3 )
    allocate( work1(lwork1), work2(lwork2), work3(lwork3) )

    call PZHEGVX( 1, 'V', 'A', 'L', 2*size, &
         tempH, 1, 1, MPI%SL_DESCLS, &
         tempS, 1, 1, MPI%SL_DESCLS, &
         VL, VU, IL, IU, ABSTOL, M, NZ, &
         vectorE, ORFAC, &
         tempQ, 1, 1, MPI%SL_DESCLS, &
         work1, lwork1, work2, lwork2, work3, lwork3, &
         ifail, icluster, gap, MPI%info )

    do i=2*MPI%smat-1,2*MPI%emat-0
       do j=1, 2*size
          matrixH(j,i)=tempQ(j,i)
       end do
    end do

    deallocate( work1, work2, work3 )
    deallocate( icluster, gap, ifail )

    if( mod(iter,MPI%num_check_sc) == 0 .or. iter == 1 ) then
       if( iter /= 0 ) then
          open(unit=16,file=Param%Option%file_ac_tempout,position='append')
          write(16,*) '                               +++++ check: ScaLapack(PZHEGVX)'
          close(16)
       end if
       allocate( work1(2) )
       work1(1)=dcmplx(1.0d0,0.d0)
       work1(2)=dcmplx(0.0d0,0.d0)

       do i=2*MPI%smat-1,2*MPI%emat-0
          do j=1, 2*size
             tempH(j,i)=matrixS(j,i)
          end do
       end do
       call MPI__ZGEMMLS('C', 'N', size, work1(1), tempQ, tempH, work1(2), tempS )
       call MPI__ZGEMMLS('N', 'N', size, work1(1), tempS, tempQ, work1(2), tempH )
       do i=2*MPI%smat-1,2*MPI%emat-0
          do j=1, 2*size
             if( i == j ) then
                if( cdabs(tempH(j,i)-1.d0) > 1.d-10 ) then
                   open(unit=16,file=Param%Option%file_ac_tempout,position='append')
                   write(16,999) i,j,tempH(j,i)
                   write(16,998)
                   close(16)
                   stop
                end if
             else
                if( cdabs(tempH(j,i)) > 1.d-10 ) then
                   open(unit=16,file=Param%Option%file_ac_tempout,position='append')
                   write(16,999) i,j,tempH(j,i)
                   write(16,998)
                   close(16)
                   stop
                end if
             end if
          end do
       end do
999    format('error: MPI__ZHEGV',2i5,2e20.10,'  -> stop')
998    format('       ====> increase: //mpi_condition/num_sc_work2space//')

       deallocate( work1 )
    end if

    deallocate( tempH, tempS, tempQ )

    return
  end subroutine MPI__ZHEGVLS

  subroutine MPI__ZGEMM( transa, transb, size, &
       alpha, matrixA, matrixB, beta, matrixC )
    implicit none

    character, intent(in)     :: transa, transb
    integer, intent(in)       :: size
    complex(8), intent(inout) :: matrixA(size,MPI%smat:MPI%smat+MPI%localsize1-1)
    complex(8), intent(inout) :: matrixB(size,MPI%smat:MPI%smat+MPI%localsize1-1)
    complex(8), intent(inout) :: matrixC(size,MPI%smat:MPI%smat+MPI%localsize1-1)
    complex(8), intent(in)    :: alpha, beta

    call PZGEMM( transa, transb, size, size, size, &
         alpha, matrixA, 1, 1, MPI%SL_DESC, matrixB, 1, 1, MPI%SL_DESC, &
         beta,  matrixC, 1, 1, MPI%SL_DESC )

    return
  end subroutine MPI__ZGEMM

  subroutine MPI__ZGEMMLS( transa, transb, size, &
       alpha, matrixA, matrixB, beta, matrixC )
    implicit none

    character, intent(in)     :: transa, transb
    integer, intent(in)       :: size
    complex(8), intent(inout) :: matrixA(2*size,2*MPI%smat-1:2*MPI%smat-1+MPI%localsize2-1)
    complex(8), intent(inout) :: matrixB(2*size,2*MPI%smat-1:2*MPI%smat-1+MPI%localsize2-1)
    complex(8), intent(inout) :: matrixC(2*size,2*MPI%smat-1:2*MPI%smat-1+MPI%localsize2-1)
    complex(8), intent(in)    :: alpha, beta

    call PZGEMM( transa, transb, 2*size, 2*size, 2*size, &
         alpha, matrixA, 1, 1, MPI%SL_DESCLS, matrixB, 1, 1, MPI%SL_DESCLS, &
         beta,  matrixC, 1, 1, MPI%SL_DESCLS )

    return
  end subroutine MPI__ZGEMMLS

  subroutine MPI__ZHEMM( size, &
       alpha, matrixA, matrixB, beta, matrixC )
    implicit none

    integer, intent(in)       :: size
    complex(8), intent(in)    :: matrixA(size,MPI%smat:MPI%emat)
    complex(8), intent(in)    :: matrixB(size,MPI%smat:MPI%emat)
    complex(8), intent(out)   :: matrixC(size,MPI%smat:MPI%emat)
    complex(8), intent(in)    :: alpha, beta

    call PZHEMM( 'L', 'L', size, size, &
         alpha, matrixA, 1, 1, MPI%SL_DESC, matrixB, 1, 1, MPI%SL_DESC, &
         beta,  matrixC, 1, 1, MPI%SL_DESC )

    return
  end subroutine MPI__ZHEMM

  subroutine MPI__ZPOTRI( size, matrixA )
    implicit none

    integer, intent(in)       :: size
    complex(8), intent(inout) :: matrixA(size,MPI%smat:MPI%emat)

    call PZPOTRF( 'L', size, matrixA, 1, 1, MPI%SL_DESC, MPI%info )
    call PZPOTRI( 'L', size, matrixA, 1, 1, MPI%SL_DESC, MPI%info )

    return
  end subroutine MPI__ZPOTRI

  subroutine MPI__ZGETRI( size, matrixA )
    implicit none

    integer, intent(in)       :: size
    complex(8), intent(inout) :: matrixA(size,MPI%smat:MPI%emat)

    integer, allocatable :: ipiv(:)
    complex(8), allocatable :: work1(:)
    integer, allocatable :: work2(:)
    integer :: lwork1, lwork2

    allocate( work1(1), work2(1) )
    lwork1=-1; lwork2=-1

    call PZGETRI( size, matrixA, 1, 1, MPI%SL_DESC, &
         ipiv, work1, lwork1, work2, lwork2, MPI%info )

    lwork1 = int(work1(1))+100
    lwork2 = int(work2(1))+100
    deallocate( work1, work2 )

    allocate( work1(lwork1), work2(lwork2) )
    allocate( ipiv(size) )

    call PZGETRF( size, size, matrixA, 1, 1, MPI%SL_DESC, &
         ipiv, MPI%info )

    call PZGETRI( size, matrixA, 1, 1, MPI%SL_DESC, &
         ipiv, work1, lwork1, work2, lwork2, MPI%info )

    deallocate( ipiv )
    deallocate( work1, work2 )

    return
  end subroutine MPI__ZGETRI
end module ac_mpi_module
