#define NEC_TUNE
!#undef NEC_TUNE
! AAS for Modified Davidson and Modified Kosugi
module m_ES_WF_by_ModifiedDavidson
#ifdef TRANSPOSE
#ifdef VPP
#define _ODD_BOUNDARY_
#endif
#ifdef SX
#define _ODD_BOUNDARY_
#endif
#ifdef NEC_TUNE1
#define _ODD_BOUNDARY_
#endif
#ifdef MPI_FFTW
  use, intrinsic :: iso_c_binding
#endif
  use m_Const_Parameters,    only : DP,SP,DIRECT,ON,OFF,SCF,GAMMA,OTHER_BANDS,SKIP,EXECUT,ELECTRON, OLD
  use m_Parallelization,     only : MPI_CommGroup &
       &                          , myrank_e,myrank_k,map_e,map_k,ista_e,iend_e,istep_e &
       &                          , ista_k,iend_k,np_g1k,ista_g1,mpi_k_world,ierr,map_z &
       &                          , np_e,npes,nrank_e,mype,nrank_g, ista_spin, iend_spin
  use m_Control_Parameters,  only : nspin,ipridavidson,kimg,neg,af,npartition_david &
       &                          , delta_eig_empty &
       &                          , delta_eig_occup &
       &                          , eps_mdkosugi, eps_residual_mdkosugi, sw_fft_xzy &
       &                          , max_iter_david, sw_MRCV_only,eps_david &
       &                          , nblocksize_fftw, nblocksize_fftw_is_given &
       &                          , sw_serial_fft &
       &                          , sw_keep_hloc_phi &
#ifdef SAVE_FFT_TIMES
       &                          , sw_divide_subspace, sw_save_fft, sw_hybrid_functional
#else
       &                          , sw_divide_subspace, sw_hybrid_functional
#endif
#ifdef MPI_FFTW
  use m_Control_Parameters,  only : sw_mpi_fftw
#endif
  use m_Files,               only : nfout
  use m_Timing,              only : tstatc0_begin, tstatc0_end
  use m_FFT,                 only : nfft,fft_box_size_WF, m_FFT_Vlocal_W_3D, m_FFT_Direct_3D &
#ifdef FFT_3D_DIVISION
 &                                , m_FFT_Vlocal_W_3DIV_3D, m_FFT_Direct_3DIV_3D             &
#endif
 &                                , m_FFT_Direct_XYZ_3D
#ifdef MPI_FFTW
  use m_FFT,                 only : m_FFT_Vlocal_W_mpifftw, m_FFT_Direct_MPI_FFTW, afft_mpifftw, bfft_mpifftw &
                                  , m_FFT_Vlocal_W_mpifftw3d, afft_mpifftw_vlocal
#endif
  use m_ES_WF_by_SDorCG,     only : map_fft_to_WF_3D
#ifdef MPI_FFTW
  use m_ES_WF_by_SDorCG,     only : map_fft_to_WF_mpifftw, gen_fft_to_WF_map
  use m_Electronic_Structure, only : m_ES_fftbox_map
#endif
  use m_Kpoints,             only : kv3,vkxyz, k_symmetry
  use m_PlaneWaveBasisSet,   only : kg1, iba, igf, nbase, m_pwBS_kinetic_energies
  use m_Electronic_Structure,only : zaj_l, neordr, nrvf_ordr, eko_l, vlhxcQ &
       &                          , occup_l &
       &                          , fsr_l,fsi_l, vnlph_l, vlhxc_l &
       &                          , m_ES_gather_f_3d_to_2d_k &
       &                          , m_ES_gather_f_3d_to_2d &
#ifdef SAVE_FFT_TIMES
       &                          , status_saved_phifftr &
#endif
       &                          , m_ES_Vlocal_in_Rspace_3D &
       &                          , m_ES_WF_in_Rspace_3D &
       &                          , m_ES_wd_zaj_small_portion_3D &
       &                          , m_ES_wd_eko_3D &
       &                          , m_ES_sort_eigen_values_3D &
       &                          , m_ES_WF_2D &
       &                          , hlocphi_l, vtau_l
#ifdef MPI_FFTW
  use m_Electronic_Structure,only : m_ES_WF_in_Rspace_mpifftw, m_ES_Vlocal_in_Rspace_mpifftw, m_ES_Vlocal_in_Rspace_mpifftw3d
#endif
  use m_ES_ortho,           only : np_g1k_x                                          &
       &                         , m_ES_orthogonal_phi_to_WFs
  use m_ES_nonlocal,        only : m_ES_Vnonlocal_W_3D                               &
       &                         , m_ES_betar_dot_WFs_4_each_k_3D                    &
       &                         , m_ES_alloc_scss_etc_3D                            &
       &                         , m_ES_dealloc_scss_etc                             &
       &                         , m_ES_betar_dot_Psi_4_each_k_3D
  use m_Ionic_System,       only : natm, iwei, ityp, ntyp
  use m_PseudoPotential,    only : ilmt,nlmta,lmta,q,dion &
       &                         , lmtt,ltp,mtp &
       &                         , m_PP_include_vanderbilt_pot &
       &                         , ipaw,dion_paw,modnrm
  use m_NonLocal_Potential, only : snl
  use m_Parallelization,    only : nel_fft_x , nel_fft_y, nel_fft_z        &
       &                         , fft_X_x_nel, fft_X_y_nel, fft_X_z_nel   &
       &                         , mp_g1k, myrank_g                        &
       &                         , ista_g1k, iend_g1k, neg_g, mpi_ke_world &
       &                         , mpi_kg_world            &
       &                         , np_fs, myrank_g, nis_fs
#ifdef NEC_TUNE
  use m_Parallelization,    only : ista_atm, iend_atm, ista_fs, iend_fs, np_fs
#endif
! === TIMERTIMERTIMER ==========================================================
#ifdef __TIMER_FFT__
use mod_timer
#endif
! === TIMERTIMERTIMER ==========================================================


!!$! ============================== added by K. Tagami ================== 11.0
!!$  use m_Const_Parameters,   only : CMPLDP, Neglected
!!$  use m_Control_Parameters,  only : ndim_spinor, noncol, ndim_chgpot, SpinOrbit_mode, &
!!$       &                            sw_hubbard
!!$  use m_PseudoPotential,     only : q_noncl, dion_scr_noncl
!!$!  use m_Electronic_Structure,  only : m_ES_Vlocal_in_Rspace_noncl
!!$
!!$!  use m_FFT,                 only : m_FFT_Vlocal_W_noncl
!!$!  use m_ES_ortho,              only : m_ES_orthogonl_SD_to_WFs_noncl
!!$!  use m_Electronic_Structure,      only : m_ES_sort_eigen_vals_noncl
!!$! ==================================================================== 11.0

  use m_ES_ExactExchange, only : m_ES_Vexx_W,m_ES_EXX_potential &
     , m_ES_EXX_gather_valence_states

  use m_Control_Parameters,  only : use_metagga, vtau_exists
  use m_ES_WF_by_SDorCG,     only : m_ES_contrib_kindens_to_vnlph, &
       &                            m_ES_kindens_to_vnlph_ib, m_ES_kindens_to_vnlph_ib2
#ifdef MPI_FFTW
  use m_ES_WF_by_SDorCG,     only : m_ES_con_kindens_to_vnlph_mpfw, &
       &                            m_ES_kindens_to_vnlph_ib_mpfw, &
       &                            m_ES_kindens_to_vnlph_ib2_mpfw
#endif
  use mpi

  implicit none
  integer, private, parameter                         :: sw_timing_2ndlevel = ON

  private

  integer, allocatable, dimension(:) :: nsize_subspace, nsize_matrix
  integer, allocatable, dimension(:) :: ista_e_l,iend_e_l,ielm_e_l
  integer :: nsize_sb_now, nsize_mt_now, nsize_mt_old,msize_matrix
  integer :: nblock,msize_subspace
  real(kind=DP), allocatable, target, dimension(:) :: w1hw2,w1hw2_mpi
  real(kind=DP), allocatable, target, dimension(:) :: w1sw2,w1sw2_mpi
  real(kind=DP), allocatable, target, dimension(:,:,:,:) :: zat_l
  real(kind=DP), allocatable, target, dimension(:,:,:,:) :: zah_l
  real(kind=DP), allocatable, target, dimension(:,:,:) :: fsr,fsi
  real(kind=DP), pointer, dimension(:,:,:) :: fsr_p,fsi_p
  real(kind=DP), allocatable, target, dimension(:,:,:) :: fsr_t,fsi_t
  real(kind=DP), allocatable, dimension(:,:,:) :: fsr_mpi,fsi_mpi
  real(kind=DP), pointer, dimension(:,:,:,:) :: zat_l_p, zah_l_p
  real(kind=DP), allocatable, target, dimension(:,:,:,:) :: zat_l_t,zah_l_t
  logical, allocatable, target, dimension(:)   :: feigconv_t
  logical, allocatable, dimension(:)   :: feigconv_mpi
  logical, pointer, dimension(:)   :: feigconv_p
  real(kind=DP), allocatable, dimension(:,:,:) :: zajold_l
  real(kind=DP), allocatable, dimension(:,:,:) :: zaj_l_backup
  real(kind=DP), allocatable, dimension(:,:) :: fsrold_l,fsiold_l
  real(kind=DP), allocatable, dimension(:,:,:,:):: wfsd_l  !d(kg1,np_e,ik:ik,kimg)
  real(kind=DP), allocatable, dimension(:,:,:)  :: bsdr_l, bsdi_l !d(np_e,nlmta,1)
  real(kind=DP) :: eps_residual
  logical, allocatable, target, dimension(:)   :: feigconv
  integer, allocatable, dimension(:,:) :: ibover

  real(kind=DP), allocatable,dimension(:) ::     eko_d
  real(kind=DP), allocatable,dimension(:) ::     eko_d_mpi
  integer, allocatable,dimension(:) ::     occup

  public :: m_ESmddavid_Renew_WF
  public :: m_ESmddavid_Subspace_Rotation

! --------------------
! non-collinear
! --------------------
  real(kind=DP), allocatable, dimension(:,:,:,:,:) :: zat_l_noncl
  real(kind=DP), allocatable, dimension(:,:,:,:,:) :: zah_l_noncl
  real(kind=DP), allocatable, dimension(:,:,:,:) :: fsr_noncl, fsi_noncl
  real(kind=DP), allocatable, dimension(:,:,:,:) :: zaj_l_backup_noncl
!
  real(kind=DP), allocatable, dimension(:,:,:,:) :: zajold_l_noncl
  real(kind=DP), allocatable, dimension(:,:,:) :: fsrold_l_noncl,fsiold_l_noncl

! --------------------
! meta-gga
! --------------------
  real(kind=DP), allocatable :: cfft_l(:)
  real(kind=DP), allocatable :: cfft_mpifftw_vlocal(:,:,:)


  integer :: icountocc,icountuocc
  real(kind=DP), allocatable, dimension(:,:) :: eko_l_old

#ifdef MPI_FFTW
  include 'fftw3-mpi.f03'
#endif
!  include 'mpif.h'

contains

  subroutine SD_direction_3D( precon, ik, ibo, ekin_l, p_l, itot, lsize, &
       &                      ng, vexx, vtau_phl, VlocalW )
    integer     , intent(in)                   :: precon,ik,ibo,itot
    real(kind=DP), intent(in), dimension(maxval(np_g1k))  :: ekin_l
!!#ifdef FFT_3D_DIVISION
!!    real(kind=DP), intent(in), dimension(lsize*2   ,1) :: VlocalW
!!#else
!!    real(kind=DP), intent(in), dimension(lsize*kimg,1) :: VlocalW
!!#endif
    real(kind=DP)             , dimension(maxval(np_g1k))  :: p_l
    integer, intent(in) :: lsize,ng
    real(kind=DP), dimension(ng,kimg), intent(in) :: vexx
    real(kind=DP), intent(in), optional :: vtau_phl(maxval(np_g1k),kimg)

#ifdef FFT_3D_DIVISION
    real(kind=DP), intent(in), optional, dimension(lsize*2   ,1) :: VlocalW
#else
    real(kind=DP), intent(in), optional, dimension(lsize*kimg,1) :: VlocalW
#endif

    integer       :: i, ib, iadd
    real(kind=DP) :: devr,denom, e1, devi, norm, eko
    integer :: id_sname = -1

!   ib = map_z(ibo)                                  ! MPI
    call tstatc0_begin('SD_direction_3D ',id_sname)
    ib = ibo
    denom = 1.d0/product(fft_box_size_WF(1:3,1))
    norm = 0.d0
    eko = eko_l(ib,ik)

    if(itot==1) then
       if(sw_keep_hloc_phi==ON) then
          if(kimg == 1) then
             do i=ista_g1k(ik),iend_g1k(ik)
                iadd = i - ista_g1k(ik) + 1
                zah_l(iadd,ib,1,1) = hlocphi_l(iadd,ib,ik,1)
             enddo
             if(sw_hybrid_functional==ON) then
                zah_l(:,ib,1,1) = zah_l(:,ib,1,1) + vexx(:,1)
             endif
             if( use_metagga .and. vtau_exists ) then
                zah_l(:,ib,1,1) = zah_l(:,ib,1,1) +vtau_phl(:,1)
             endif
          else
             do i=ista_g1k(ik),iend_g1k(ik)
                iadd = i - ista_g1k(ik) + 1
                zah_l(iadd,ib,1,   1)= hlocphi_l(iadd,ib,ik,1)
                zah_l(iadd,ib,kimg,1)= hlocphi_l(iadd,ib,ik,kimg)
             enddo
             if(sw_hybrid_functional==ON) then
                zah_l(:,ib,1,1) = zah_l(:,ib,1,1) + vexx(:,1)
                zah_l(:,ib,kimg,1) = zah_l(:,ib,kimg,1) + vexx(:,kimg)
             endif
             if( use_metagga .and. vtau_exists ) then
                zah_l(:,ib,1,1)    = zah_l(:,ib,1,1)    +vtau_phl(:,1)
                zah_l(:,ib,kimg,1) = zah_l(:,ib,kimg,1) +vtau_phl(:,kimg)
             endif
          endif
       else
          if(kimg == 1) then
             do i = ista_g1k(ik), iend_g1k(ik)
                iadd = i - ista_g1k(ik) + 1
                devr  = ekin_l(iadd)*zaj_l(iadd,ib,ik,1)&
                     & + VlocalW(iadd,1)*denom
                zah_l(iadd,ib,1,1) = devr
             end do
             if(sw_hybrid_functional==ON) then
                zah_l(:,ib,1,1) = zah_l(:,ib,1,1) + vexx(:,1)
             endif
             if( use_metagga .and. vtau_exists ) then
                zah_l(:,ib,1,1) = zah_l(:,ib,1,1) + vtau_phl(:,1)
             endif
          else if(kimg == 2) then
             do i = ista_g1k(ik), iend_g1k(ik)
                iadd = i - ista_g1k(ik) + 1
                e1    = ekin_l(iadd)
                devr  = e1*zaj_l(iadd,ib,ik,1) + VlocalW(2*iadd-1,1)*denom
                devi  = e1*zaj_l(iadd,ib,ik,2) + VlocalW(2*iadd,  1)*denom
                zah_l(iadd,ib,1,   1) = devr
                zah_l(iadd,ib,kimg,1) = devi
             end do
             if(sw_hybrid_functional==ON) then
                zah_l(:,ib,1,1) = zah_l(:,ib,1,1) + vexx(:,1)
                zah_l(:,ib,kimg,1) = zah_l(:,ib,kimg,1) + vexx(:,kimg)
             endif
             if( use_metagga .and. vtau_exists ) then
                zah_l(:,ib,1,1)    = zah_l(:,ib,1,1)    + vtau_phl(:,1)
                zah_l(:,ib,kimg,1) = zah_l(:,ib,kimg,1) + vtau_phl(:,kimg)
             endif
          endif
       endif
    endif

    if(kimg == 1) then
       do i = ista_g1k(ik), iend_g1k(ik)
          iadd = i - ista_g1k(ik) + 1
          devr = zah_l(iadd,ib,1,1)+vnlph_l(iadd,ib,1) - eko*zaj_l(iadd,ib,ik,1)
          if(sw_hybrid_functional==ON) devr = devr-vexx(iadd,1)
          if( use_metagga .and. vtau_exists ) then
             devr = devr -vtau_phl(iadd,1)
          endif
          wfsd_l(iadd,ib,ik,1) = - devr
          norm = norm + devr*devr
       end do
    else if(kimg == 2) then
       do i = ista_g1k(ik), iend_g1k(ik)
          iadd = i - ista_g1k(ik) + 1
          e1    = ekin_l(iadd) - eko
          devr = zah_l(iadd,ib,1,1)+vnlph_l(iadd,ib,1) - eko*zaj_l(iadd,ib,ik,1)
          devi = zah_l(iadd,ib,2,1)+vnlph_l(iadd,ib,2) - eko*zaj_l(iadd,ib,ik,2)
          if(sw_hybrid_functional==ON) then
            devr = devr-vexx(iadd,1)
            devi = devi-vexx(iadd,2)
          endif
          if( use_metagga .and. vtau_exists ) then
             devr = devr -vtau_phl(iadd,1)
             devi = devi -vtau_phl(iadd,2)
          endif
          wfsd_l(iadd,ib,ik,1) = - devr
          wfsd_l(iadd,ib,ik,2) = - devi
          norm = norm + devr*devr + devi*devi
       end do
       if(k_symmetry(ik) == GAMMA) then
          if(ista_g1k(ik) == 1) then
             devr=wfsd_l(1,ib,ik,1)
             devi=wfsd_l(1,ib,ik,2)
             norm = norm*2.d0 - devr*devr - devi*devi
          end if
       end if
    end if
    call mpi_allreduce(MPI_IN_PLACE,norm,1,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_ke_world,ierr)

!    feigconv(ib)=.false.
!    if(sqrt(norm) .lt. eps_residual) feigconv(ib)=.true.

    if(precon==ON) then
      call decide_precon_factor_wfsd_3D(ik,ibo,ekin_l,p_l)
!call decide_precon_factor_david(ik,hdiag,sdiag,eko_l(ib,ik),p)
      if(kimg == 1) then
         do i = ista_g1k(ik), iend_g1k(ik)
            iadd = i - ista_g1k(ik) + 1
            wfsd_l(iadd,ib,ik,1) = p_l(iadd)*wfsd_l(iadd,ib,ik,1)
         end do
      else if(kimg == 2) then
         do i = ista_g1k(ik), iend_g1k(ik)
            iadd = i - ista_g1k(ik) + 1
            wfsd_l(iadd,ib,ik,1) = p_l(iadd)*wfsd_l(iadd,ib,ik,1)
            wfsd_l(iadd,ib,ik,2) = p_l(iadd)*wfsd_l(iadd,ib,ik,2)
         end do
      end if
    end if
    call tstatc0_end(id_sname)
  end subroutine SD_direction_3D

  subroutine SD_direction_3D2( precon, ik, ib1, ib2, ibesize, ekin_l, p_l, &
       &                       itot, lsize, ng, vexx, vtau_phl, VlocalW )
    integer     , intent(in)                   :: precon,ik,ib1,ib2,ibesize,itot
    real(kind=DP), intent(in), dimension(maxval(np_g1k))  :: ekin_l
    real(kind=DP)             , dimension(maxval(np_g1k))  :: p_l
    integer, intent(in) :: lsize,ng
    real(kind=DP), dimension(ng,kimg), intent(in) :: vexx
    real(kind=DP), intent(in), optional :: vtau_phl(maxval(np_g1k),kimg)
#ifdef FFT_3D_DIVISION
    real(kind=DP), intent(in), optional, dimension(lsize*2   ,1) :: VlocalW
#else
    real(kind=DP), intent(in), optional, dimension(lsize*kimg,1) :: VlocalW
#endif

    integer       :: i, ib, iadd
    real(kind=DP) :: devr,denom, e1, devi, norm
    real(kind=DP),allocatable,dimension(:) :: normb
    integer :: id_sname = -1

    call tstatc0_begin('SD_direction_3D ',id_sname)
!   ib = map_z(ibo)                                  ! MPI
!!$    ib = ibo
    denom = 1.d0/product(fft_box_size_WF(1:3,1))
!!$    norm = 0.d0
    allocate(normb(ib1:ib2)); normb = 0.d0

    if(itot==1) then
       if(sw_keep_hloc_phi==ON) then
          if(kimg == 1) then
             do ib=ib1,ib2
                do i=ista_g1k(ik),iend_g1k(ik)
                   iadd = i - ista_g1k(ik) + 1
                   zah_l(iadd,ib,1,1) = hlocphi_l(iadd,ib,ik,1)
                enddo
                if(sw_hybrid_functional==ON) then
                   zah_l(:,ib,1,1) = zah_l(:,ib,1,1) + vexx(:,1)
                endif
                if( use_metagga .and. vtau_exists ) then
                   zah_l(:,ib,1,1) = zah_l(:,ib,1,1) + vtau_phl(:,1)
                endif
             enddo
          else
             do ib=ib1,ib2
                do i=ista_g1k(ik),iend_g1k(ik)
                   iadd = i - ista_g1k(ik) + 1
                   zah_l(iadd,ib,1,   1)= hlocphi_l(iadd,ib,ik,1)
                   zah_l(iadd,ib,kimg,1)= hlocphi_l(iadd,ib,ik,2)
                enddo
                if(sw_hybrid_functional==ON) then
                   zah_l(:,ib,1,1) = zah_l(:,ib,1,1) + vexx(:,1)
                   zah_l(:,ib,kimg,1) = zah_l(:,ib,kimg,1) + vexx(:,kimg)
                endif
                if( use_metagga .and. vtau_exists ) then
                   zah_l(:,ib,1,1)    = zah_l(:,ib,1,1)    + vtau_phl(:,1)
                   zah_l(:,ib,kimg,1) = zah_l(:,ib,kimg,1) + vtau_phl(:,kimg)
                endif
             enddo
          endif
       else
          if(kimg == 1) then
             do ib=ib1,ib2
                do i = ista_g1k(ik), iend_g1k(ik)
                   iadd = i - ista_g1k(ik) + 1
                   devr  = ekin_l(iadd)*zaj_l(iadd,ib,ik,1)&
                        & + VlocalW(iadd,1)*denom
                   zah_l(iadd,ib,1,1) = devr
                end do
                if(sw_hybrid_functional==ON) then
                   zah_l(:,ib,1,1) = zah_l(:,ib,1,1) + vexx(:,1)
                endif
                if( use_metagga .and. vtau_exists ) then
                   zah_l(:,ib,1,1) = zah_l(:,ib,1,1) + vtau_phl(:,1)
                endif
             enddo
          else if(kimg == 2) then
             do ib=ib1,ib2
                do i = ista_g1k(ik), iend_g1k(ik)
                   iadd = i - ista_g1k(ik) + 1
                   e1    = ekin_l(iadd)
                   devr  = e1*zaj_l(iadd,ib,ik,1) + VlocalW(2*iadd-1,1)*denom
                   devi  = e1*zaj_l(iadd,ib,ik,2) + VlocalW(2*iadd,  1)*denom
                   zah_l(iadd,ib,1,   1) = devr
                   zah_l(iadd,ib,kimg,1) = devi
                end do
                if(sw_hybrid_functional==ON) then
                   zah_l(:,ib,1,1) = zah_l(:,ib,1,1) + vexx(:,1)
                   zah_l(:,ib,kimg,1) = zah_l(:,ib,kimg,1) + vexx(:,kimg)
                endif
                if( use_metagga .and. vtau_exists ) then
                   zah_l(:,ib,1,1)    = zah_l(:,ib,1,1)    + vtau_phl(:,1)
                   zah_l(:,ib,kimg,1) = zah_l(:,ib,kimg,1) + vtau_phl(:,kimg)
                endif
             enddo
          endif
       endif
    endif

    if(kimg == 1) then
       do i = ista_g1k(ik), iend_g1k(ik)
          iadd = i - ista_g1k(ik) + 1
          do ib = ib1, ib2
             devr  = zah_l(iadd,ib,1,1) + vnlph_l(iadd,ib,1)-eko_l(ib,ik)*zaj_l(iadd,ib,ik,1)
             if(sw_hybrid_functional==ON) then
                devr = devr - vexx(i,1)
             endif
             if( use_metagga .and. vtau_exists ) then
                devr = devr - vtau_phl(i,1)
             endif
             wfsd_l(iadd,ib,ik,1) = - devr
             normb(ib) = normb(ib) + devr*devr
          end do
       end do
    else if(kimg == 2) then
       do i = ista_g1k(ik), iend_g1k(ik)
          iadd = i - ista_g1k(ik) + 1
          do ib = ib1, ib2
             devr = zah_l(iadd,ib,1,1)+vnlph_l(iadd,ib,1) - eko_l(ib,ik) * zaj_l(iadd,ib,ik,1)
             devi = zah_l(iadd,ib,2,1)+vnlph_l(iadd,ib,2) - eko_l(ib,ik) * zaj_l(iadd,ib,ik,2)
             if(sw_hybrid_functional==ON) then
                devr = devr - vexx(i,1)
                devi = devi - vexx(i,2)
             endif
             if( use_metagga .and. vtau_exists ) then
                devr = devr - vtau_phl(i,1)
                devi = devi - vtau_phl(i,2)
             endif
             wfsd_l(iadd,ib,ik,1) = - devr
             wfsd_l(iadd,ib,ik,2) = - devi
             normb(ib) = normb(ib) + devr*devr + devi*devi
          end do
       end do
       if(k_symmetry(ik) == GAMMA) then
          if(ista_g1k(ik) == 1) then
             do ib = ib1, ib2
                devr=wfsd_l(1,ib,ik,1)
                devi=wfsd_l(1,ib,ik,2)
                normb(ib) = normb(ib)*2.d0 - devr*devr - devi*devi
             end do
          end if
       end if
    end if
    call mpi_allreduce(MPI_IN_PLACE,normb,ib2-ib1+1,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_ke_world,ierr)

    do ib = ib1, ib2
!       feigconv(ib)=.false.
!       if(sqrt(normb(ib)) .lt. eps_residual) feigconv(ib)=.true.

       if(precon==ON) then
!!$      call decide_precon_factor_wfsd_3D(ik,ibo,ekin_l,p_l)
          call decide_precon_factor_wfsd_3D(ik,ib,ekin_l,p_l)
!call decide_precon_factor_david(ik,hdiag,sdiag,eko_l(ib,ik),p)
          if(kimg == 1) then
             do i = ista_g1k(ik), iend_g1k(ik)
                iadd = i - ista_g1k(ik) + 1
                wfsd_l(iadd,ib,ik,1) = p_l(iadd)*wfsd_l(iadd,ib,ik,1)
             end do
          else if(kimg == 2) then
             do i = ista_g1k(ik), iend_g1k(ik)
                iadd = i - ista_g1k(ik) + 1
                wfsd_l(iadd,ib,ik,1) = p_l(iadd)*wfsd_l(iadd,ib,ik,1)
                wfsd_l(iadd,ib,ik,2) = p_l(iadd)*wfsd_l(iadd,ib,ik,2)
             end do
          end if
       end if
    end do
    deallocate(normb)
    call tstatc0_end(id_sname)
  end subroutine SD_direction_3D2

  subroutine decide_precon_factor_wfsd_3D(ik,ibo,ekin_l,p_l)
    integer, intent(in)                         :: ik,ibo
    real(kind=DP), intent(in),  dimension(maxval(np_g1k))  :: ekin_l
    real(kind=DP), intent(out), dimension(maxval(np_g1k))  :: p_l

    integer       :: i, iadd
    real(kind=DP) :: ektot, x, x1, x2, d_ektot

!    call kinetic_energy_wfsd(ik,ibo,ekin,ektot)   ! -here
    call kinetic_energy_3D(ik,ibo,ekin_l,ektot)   ! -here
    d_ektot = 4.d0/ektot/3.d0
    p_l = 0.d0
    do i = ista_g1k(ik), iend_g1k(ik)
       iadd = i - ista_g1k(ik) + 1
       x = ekin_l(iadd)*d_ektot
       x1 = (x*x+9.d0)*(x+3.d0)
       x2 = (x*x)*(x*x)
       p_l(iadd)  = x1/(x1 + x2 )
    end do
!    p=p*d_ektot
  end subroutine decide_precon_factor_wfsd_3D

  subroutine kinetic_energy_wfsd(ik,ibo,dekin,ektot)
    integer, intent(in) :: ik, ibo
    real(kind=DP), intent(in), dimension(kg1)  :: dekin
    real(kind=DP), intent(out)                 :: ektot
    integer  :: i, ri, ib
    ektot = 0.d0
    ib=map_z(ibo)
    do ri = 1, kimg
       do i = 1, iba(ik)
          ektot = ektot + dekin(i)*wfsd_l(i,ib,ik,ri)**2   ! MPI
       end do
    end do

    if(k_symmetry(ik) == GAMMA) ektot = ektot*2.d0

  end subroutine kinetic_energy_wfsd

  subroutine kinetic_energy_3D(ik,ibo,dekin,ektot)
    integer, intent(in) :: ik, ibo
    real(kind=DP), intent(in), dimension(maxval(np_g1k)) :: dekin
    real(kind=DP), intent(out)                 :: ektot
    integer  :: i, ib, iadd
    ektot = 0.d0
!   ib = map_z(ibo)
    ib = ibo
    if(kimg == 1) then
       do i = ista_g1k(ik), iend_g1k(ik)
          iadd = i - ista_g1k(ik) + 1
          ektot = ektot + dekin(iadd)*zaj_l(iadd,ib,ik,1)**2
       end do
    else
       do i = ista_g1k(ik), iend_g1k(ik)
          iadd = i - ista_g1k(ik) + 1
          ektot = ektot + dekin(iadd)*( zaj_l(iadd,ib,ik,1)**2 &
               &                      + zaj_l(iadd,ib,ik,2)**2)
       end do
    end if
    call mpi_allreduce(MPI_IN_PLACE,ektot,1,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_ke_world,ierr)
    if(k_symmetry(ik) == GAMMA)  ektot = ektot*2.d0
  end subroutine kinetic_energy_3D



  subroutine normalize_wfsd_3D(ik)
    integer,intent(in) :: ik
    real(kind=DP) :: norm, wfsdr, wfsdi
    integer :: ib1,ii,ib,iadd
    integer :: id_sname=-1
    call tstatc0_begin('normalize_wfsd_3D ',id_sname)
    do ib1 = 1, np_e
!      ib=map_z(ib1)
       ib=ib1
       norm = 0.d0
       if(kimg==1) then
          do ii=ista_g1k(ik),iend_g1k(ik)
             iadd = ii - ista_g1k(ik) + 1
             wfsdr = wfsd_l(iadd,ib,ik,kimg)
             norm = norm + wfsdr*wfsdr
          end do
          call mpi_allreduce(MPI_IN_PLACE,norm,1,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_ke_world,ierr)
          norm = 1.d0/sqrt(norm)
          do ii=ista_g1k(ik),iend_g1k(ik)
             iadd = ii - ista_g1k(ik) + 1
             wfsd_l(iadd,ib,ik,1) = wfsd_l(iadd,ib,ik,1)*norm
          end do
!!$          bsdr_l(ib,1:nlmta,ik) = bsdr_l(ib,1:nlmta,ik)*norm
!!$          bsdi_l(ib,1:nlmta,ik) = bsdi_l(ib,1:nlmta,ik)*norm
          bsdr_l(ib,:,ik) = bsdr_l(ib,:,ik)*norm
          bsdi_l(ib,:,ik) = bsdi_l(ib,:,ik)*norm
       else
          do ii=ista_g1k(ik),iend_g1k(ik)
             iadd = ii - ista_g1k(ik) + 1
             wfsdr = wfsd_l(iadd,ib,ik,1   )
             wfsdi = wfsd_l(iadd,ib,ik,kimg)
             norm = norm + wfsdr*wfsdr+wfsdi*wfsdi
          end do
          call mpi_allreduce(MPI_IN_PLACE,norm,1,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_ke_world,ierr)
          norm = 1.d0/sqrt(norm)
          do ii=ista_g1k(ik),iend_g1k(ik)
             iadd = ii - ista_g1k(ik) + 1
             wfsd_l(iadd,ib,ik,1) = wfsd_l(iadd,ib,ik,1)*norm
             wfsd_l(iadd,ib,ik,2) = wfsd_l(iadd,ib,ik,2)*norm
          enddo
          if(k_symmetry(ik) == GAMMA) then
            bsdr_l(ib,:,ik) = bsdr_l(ib,:,ik)*norm
          else
            bsdr_l(ib,:,ik) = bsdr_l(ib,:,ik)*norm
            bsdi_l(ib,:,ik) = bsdi_l(ib,:,ik)*norm
          end if
       end if
    end do
    call tstatc0_end(id_sname)
  end subroutine normalize_wfsd_3D



  subroutine m_ESmddavid_Subspace_Rotation(nfout)
    integer, intent(in) :: nfout

    integer             :: ispin, ik, iksnl, switch_of_eko_part
    real(kind=DP), allocatable, dimension(:) ::  afft, bfft
    real(kind=DP), allocatable, dimension(:) :: ekin
    real(kind=DP), allocatable, dimension(:) :: afft_l
    real(kind=DP), allocatable, dimension(:,:) :: wk_bfft_l
    real(kind=DP), allocatable, dimension(:,:) :: bfft_l
    integer :: lsize, ibsize, isrsize, fft_l_size
    real(kind=DP), allocatable, dimension(:) :: ekin_l
    integer :: ipri0
    integer :: ng
    real(kind=DP), allocatable, dimension(:,:,:) :: vexx
    real(kind=DP) :: exx
#ifdef FFT_3D_DIVISION
    lsize = fft_X_x_nel*fft_X_y_nel*fft_X_z_nel
!.. allocate(afft_l(lsize*2), stat=ierr)
#else
    lsize = max(maxval(nel_fft_x(:)),maxval(nel_fft_y(:)),maxval(nel_fft_z(:)))
!.. allocate(afft_l(lsize*kimg), stat=ierr)
#endif
    if(ierr /= 0) then
       write(nfout,*)' m_ESmddavid_Subspace_Rotation: Not allocated afft_l array'
       call flush(nfout)
       call mpi_abort(mpi_comm_world, 201, ierr)
    endif
    ibsize = 1

    allocate(ekin_l(maxval(np_g1k)))
    call m_ES_alloc_scss_etc_3D()
    allocate(afft(nfft)); allocate(bfft(nfft))
    call allocate_fsri_3D

    if ( use_metagga .and. vtau_exists ) then
#ifdef FFT_3D_DIVISION
       allocate(cfft_l(lsize*2) ,stat=ierr)
#else
       allocate(cfft_l(lsize*kimg) ,stat=ierr)
#endif
    endif

    do ispin = 1, nspin, (af+1)
       call m_ES_Vlocal_in_Rspace_3D(ispin,afft_l,lsize,1,OFF)      ! (ptfft1) vlhxc_l->afft
       if ( use_metagga .and. vtau_exists ) then
          call m_ES_Vlocal_in_Rspace_3D( ispin, cfft_l, lsize, 1, OFF, vtau_l ) ! r space
       endif

       do ik = ispin, kv3-nspin+ispin, nspin
          if(map_k(ik) /= myrank_k) cycle          ! MPI
          isrsize = min(lsize,mp_g1k(ik))
          fft_l_size  = nel_fft_x(myrank_g)
#ifdef FFT_3D_DIVISION
          allocate(wk_bfft_l(lsize*2   ,ibsize) ,stat=ierr)
          allocate(bfft_l(lsize*2   ,ibsize) ,stat=ierr)
#else
          allocate(wk_bfft_l(lsize*kimg,ibsize) ,stat=ierr)
          allocate(bfft_l(lsize*kimg,ibsize) ,stat=ierr)
#endif
          if (ierr /= 0) then
             write(nfout,*)' m_ESmddavid_Subspace_Rotation:  Not allocate '
             call flush(nfout)
             call mpi_abort(mpi_comm_world, 205, ierr)
          endif
          iksnl = (ik-1)/nspin + 1

          call allocate_t_matrix_sr_3D(ik) ! -> np_g1k_x
          call m_pwBS_kinetic_energies(ik,vkxyz,ekin_l) ! (diakin) ->ekin
          call m_ES_Vnonlocal_W_3D(ik,iksnl,ispin,switch_of_eko_part=OFF) ! -> vnlph_l

          if ( use_metagga .and. vtau_exists ) &
               &           call m_ES_contrib_kindens_to_vnlph( ispin, ik, lsize, cfft_l )

          if(sw_hybrid_functional==ON) call m_ES_Vexx_W(ik)
          call allreduce_fs_sr_3D(ik) ! -> fsr,fsi
          call evolve_WFs_in_subspace_sr_3D& !-(m_ES_WF_by_ModifiedDavidson)
                                      &(ik,ispin,ekin_l,afft_l,bfft_l, &
                                        wk_bfft_l,lsize,ibsize,isrsize,fft_l_size) !-> zaj_l
          if(ik==1.and.ipridavidson>= 2) &
            & call m_ES_wd_zaj_small_portion_3D(nfout,ik," -- after davidson subspace rotation --",21)
          call m_ES_betar_dot_WFs_4_each_k_3D(nfout,ik)   ! -> fsr_l,fsi_l
          if(ipridavidson>=2) then
             write(nfout,'("Davidson Subspace Rotation: ik=",i5," subspace=",i5)') ik, nsize_sb_now
          end if
          call deallocate_t_matrix_sr
          deallocate(wk_bfft_l)
          deallocate(bfft_l)

       enddo      ! k-point loop
    enddo      ! spin loop

    call deallocate_fsri
    if ( allocated(cfft_l) ) deallocate( cfft_l )

!!  ( in case of af=1 )
    if(af /= 0) then
       call cp_eigen_values_for_af       !-(contained here)
       call expand_neordr_and_nrvf_ordr  !-(contained here)
    end if

    call get_ipri0(ipridavidson,ipri0)
    if(ipri0 >= 2) call m_ES_wd_eko_3D(nfout,mode=SCF)

    deallocate(bfft);   deallocate(afft)
    call m_ES_dealloc_scss_etc()
    deallocate(ekin_l)

  contains

    subroutine get_ipri0(ipri_in, ipri_out)
      integer, intent(in)  :: ipri_in
      integer, intent(out) :: ipri_out
      if(npes > 1) then
         if(mype == 0) ipri_out = ipri_in
         call mpi_bcast(ipri_out,1,mpi_integer,0,MPI_CommGroup,ierr)
      else
         ipri_out = ipri_in
      end if
    end subroutine get_ipri0

    subroutine cp_eigen_values_for_af
      integer :: ik,ib
      do ik = 1, kv3, af+1
         if(map_k(ik) /= myrank_k) cycle    ! MPI
         do ib = 1, np_e                    ! MPI
            eko_l(ib,ik+af) = eko_l(ib,ik)
         enddo
      enddo
    end subroutine cp_eigen_values_for_af

    subroutine expand_neordr_and_nrvf_ordr
      integer :: ik
      do ik = 1, kv3, af+1
         if(map_k(ik) /= myrank_k) cycle     ! MPI
         neordr(1:neg,ik+af) = neordr(1:neg,ik)
         nrvf_ordr(1:neg,ik+af) = nrvf_ordr(1:neg,ik)
      end do
    end subroutine expand_neordr_and_nrvf_ordr

  end subroutine m_ESmddavid_Subspace_Rotation



  subroutine allocate_fsri_3D
    nsize_sb_now = neg
    nsize_mt_now =  nsize_sb_now*(nsize_sb_now+1)/2
    allocate(fsr(neg,nlmta,1))
    if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
       allocate(fsi(neg,nlmta,1))
    end if
    allocate(zaj_l_backup(maxval(np_g1k),np_e,kimg)) ! MPI
  end subroutine allocate_fsri_3D

  subroutine deallocate_fsri
    deallocate(fsr)
    if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
       deallocate(fsi)
    end if
    deallocate(zaj_l_backup)
  end subroutine deallocate_fsri


  subroutine allocate_t_matrix_sr_3D(ik)
    integer, intent(in) :: ik
    integer :: kimg_t
    if(k_symmetry(ik) == GAMMA) then
       kimg_t = 1
    else
       kimg_t = kimg
    end if
#ifdef _ODD_BOUNDARY_
    if(mod(np_g1k(ik),2) == 0) then
       np_g1k_x = np_g1k(ik) + 1
    else
       np_g1k_x = np_g1k(ik)
    end if
#else
    np_g1k_x = np_g1k(ik)
#endif
    allocate(zat_l(maxval(np_g1k),neg,kimg,1)) ! MPI
    allocate(zah_l(maxval(np_g1k),np_e,kimg,1)) ! MPI
    allocate(w1hw2(nsize_mt_now*kimg_t))
    allocate(w1sw2(nsize_mt_now*kimg_t))
    if(npes>1) then
       allocate(w1hw2_mpi(nsize_mt_now*kimg_t))
       allocate(w1sw2_mpi(nsize_mt_now*kimg_t))
    end if
    zaj_l_backup(:,:,:) = zaj_l(:,:,ik,:)
  end subroutine allocate_t_matrix_sr_3D

  subroutine deallocate_t_matrix_sr
! ============================ modified by K. Tagami ========== 11.0
!    deallocate(zat_l) ! MPI
!    deallocate(zah_l) ! MPI

       deallocate(zat_l) ! MPI
       deallocate(zah_l) ! MPI
! ============================================================== 11.0
    deallocate(w1hw2)
    deallocate(w1sw2)
    if(npes>1) then
       deallocate(w1hw2_mpi)
       deallocate(w1sw2_mpi)
    end if
  end subroutine deallocate_t_matrix_sr

  subroutine allreduce_fs_sr_3D(ik)
    integer, intent(in) :: ik
    integer :: ib,ib1,kimg_t,is,is1
    real(kind=DP), allocatable, dimension(:,:) :: fs_mpi

    allocate(fs_mpi(neg,nlmta))

    fs_mpi=0.d0
    do is = 1, np_fs ! MPI
       is1=nis_fs(myrank_g)+is-1
       do ib = 1, np_e ! MPI
          ib1=neg_g(ib)
          fs_mpi(ib1,is1) = fsr_l(ib,is,ik)
       end do
    end do
    call mpi_allreduce(MPI_IN_PLACE,fs_mpi,neg*nlmta &
      & ,mpi_double_precision,mpi_sum,mpi_k_world(myrank_k),ierr)       ! MPI
    fsr(1:neg,1:nlmta,1) = fs_mpi(1:neg,1:nlmta)
    if(.not. k_symmetry(ik) == GAMMA) then
       fs_mpi=0.d0
       do is = 1, np_fs ! MPI
          is1=nis_fs(myrank_g)+is-1
          do ib = 1, np_e ! MPI
             ib1=neg_g(ib)
             fs_mpi(ib1,is1) = fsi_l(ib,is,ik)
          end do
       end do
       call mpi_allreduce(MPI_IN_PLACE,fs_mpi,neg*nlmta &
         & ,mpi_double_precision,mpi_sum,mpi_k_world(myrank_k),ierr)       ! MPI
       fsi(1:neg,1:nlmta,1) = fs_mpi(1:neg,1:nlmta)
    end if
    deallocate(fs_mpi)
  end subroutine allreduce_fs_sr_3D

  subroutine evolve_WFs_in_subspace_sr_3D(ik,ispin,ekin_l,afft_l,bfft_l, &
                                          wk_bfft_l,lsize,ibsize,isrsize,fft_l_size)
    integer, intent(in) :: ik,ispin,lsize,ibsize,isrsize,fft_l_size
    real(kind=DP), intent(in)  :: ekin_l(maxval(np_g1k))
#ifdef FFT_3D_DIVISION
    real(kind=DP), intent(in)  :: afft_l(lsize*2   )
    real(kind=DP), intent(out) :: bfft_l(lsize*2   ,1)
    real(kind=DP), intent(inout) :: wk_bfft_l(lsize*2   ,ibsize)
#else
    real(kind=DP), intent(in)  :: afft_l(lsize*kimg)
    real(kind=DP), intent(out) :: bfft_l(lsize*kimg,1)
    real(kind=DP), intent(inout) :: wk_bfft_l(lsize*kimg,ibsize)
#endif
    integer :: iadd, ib2_
! (allocatable variables)
    real(kind=DP), allocatable,dimension(:) ::     eig
    real(kind=DP), allocatable,dimension(:,:) ::   vec

    integer       :: ib1,ib2,ib1to,ib2to,i1,ii,ri,ib
!    integer       :: ibb1,ibb2
!    integer       :: ii1,ii2,iter,iter1,iter2
    real(kind=DP) :: denom, eko1, eko2, ekod
    real(kind=DP) :: hr2,hi2,dr1,dr2,di1,di2,dd
    integer :: ip0,ip0b,ip1,ip1b,ib1n,ib2n,ndata,nshift,kimg_t,ig1
    integer :: noffset
!    integer :: nsize_max_sb_now
    integer :: ierr_diag
    integer :: id_sname = -1, ipri0
    call tstatc0_begin('evolve_WFs_in_subspace_sr_3D(davidson) ', id_sname,1)

    call get_ipri0(ipridavidson,ipri0)

    denom = 1.d0/product(fft_box_size_WF(1:3,1))
    if(k_symmetry(ik) == GAMMA) then
       kimg_t = 1
    else
       kimg_t = kimg
    end if

!    nsize_sb_now = nsize_subspace(1)
!    nsize_mt_now = nsize_matrix(1)

    allocate(eig(nsize_sb_now)); eig=0.d0
    allocate(vec(nsize_sb_now*kimg_t,nsize_sb_now))
    allocate(eko_d(neg));     eko_d = 0.d0
    allocate(occup(neg)); occup=0

    if(ipridavidson >=2) then
       write(nfout,*) 'MdDavidson Subspace Rotation:ik,nsize_sb_now=', ik,nsize_sb_now
    end if


    do ib1 = 1, np_e
       eko_d(neg_g(ib1)) = eko_l(ib1,ik)  ! MPI
    end do
    call mpi_allreduce(MPI_IN_PLACE,eko_d,neg,mpi_double_precision,mpi_sum &
         & ,mpi_kg_world,ierr) ! MPI
    eko1 = sum(eko_d(1:neg))

    do ib1=1,neg
       if(map_e(ib1) == myrank_e) then !MPI
          if( occup_l(map_z(ib1),ik) > 0.d0 ) occup(ib1) = 1  ! MPI
       end if
    end do
    call mpi_allreduce(MPI_IN_PLACE,occup,neg,mpi_integer,mpi_sum &
         & ,mpi_k_world(myrank_k),ierr)       ! MPI

! (zaj_l <- (T+Vloc)|phi> )
!!    zaj_l(:,:,ik,:) = zajold_l(:,:,:,idavid)
!( tenchi ) (zat_l <- zaj_l)
    zat_l(:,:,:,1) = 0.0d0
    do ib1 = 1, np_e
       zat_l(:,neg_g(ib1),:,1) = zaj_l(:,ib1,ik,:)
    enddo
    call mpi_allreduce(MPI_IN_PLACE,zat_l(:,:,:,1),maxval(np_g1k)*neg*kimg,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_kg_world,ierr)
    do ib1 = 1, np_e ! MPI
       ib = ib1 ! MPI
#ifdef __TIMER_COMM__
       call m_ES_WF_in_Rspace_3D(ik,ib,ib,ibsize,lsize,wk_bfft_l,0)
#else
       call m_ES_WF_in_Rspace_3D(ik,ib,ib,ibsize,lsize,wk_bfft_l)
#endif
#ifdef FFT_3D_DIVISION
         call m_FFT_Vlocal_W_3DIV_3D(afft_l,wk_bfft_l,lsize,ibsize,nel_fft_x(myrank_g))
         call m_FFT_Direct_3DIV_3D(nfout,wk_bfft_l,lsize,ibsize)
#else
         if (sw_fft_xzy > 0) then
            call m_FFT_Vlocal_W_3D(afft_l,wk_bfft_l,lsize,ibsize,nel_fft_y(myrank_g))
            call m_FFT_Direct_3D(nfout,wk_bfft_l,lsize,ibsize)
         else
            call m_FFT_Vlocal_W_3D(afft_l,wk_bfft_l,lsize,ibsize,nel_fft_z(myrank_g))
            if(sw_serial_fft == ON) then
              call m_ES_WF_2D(ik,wk_bfft_l,ib2,ib1,ibsize,lsize,DIRECT)
            else
              call m_FFT_Direct_XYZ_3D(nfout,wk_bfft_l,lsize,ibsize)
            endif
         end if
#endif
       call map_fft_to_WF_3D(ik,lsize,ibsize,wk_bfft_l,bfft_l,isrsize,fft_l_size)
       if(kimg == 1) then
          do ii=ista_g1k(ik),iend_g1k(ik)
             iadd = ii - ista_g1k(ik) + 1
             dr1 = zaj_l(iadd,ib,ik,1)
             dr2 = bfft_l(iadd,1)*denom
             zaj_l(iadd,ib,ik,1)= ekin_l(iadd)*dr1+dr2
          enddo
       else
          do ii=ista_g1k(ik),iend_g1k(ik)
             iadd = ii - ista_g1k(ik) + 1
             dr1  = zaj_l(iadd,ib,ik,1)
             di1  = zaj_l(iadd,ib,ik,kimg)
             zaj_l(iadd,ib,ik,1)    = ekin_l(iadd)*dr1+bfft_l(2*iadd-1,1)*denom
             zaj_l(iadd,ib,ik,kimg) = ekin_l(iadd)*di1+bfft_l(2*iadd,  1)*denom
          enddo
       endif
#ifdef SAVE_FFT_TIMES
       if(sw_save_fft == ON) status_saved_phifftr(ib,ik) = OLD
#endif
    enddo
!( tenchi ) (zah_l <- zaj_l)
    zah_l(:,:,:,1) = zaj_l(:,:,ik,:)

! (make matrix elements )
! parallel loop
    ! <n|T+Vloc|m> G-wise parallel
    w1hw2 = 0.d0
    w1sw2 = 0.d0
!   do ib2 = 1,nsize_sb_now
    do ib2_ = 1, np_e
       ib2 = neg_g(ib2_)
       ip0b = ib2*(ib2-1)/2
       do ib1 = 1,ib2
          ip0 = ip0b + ib1
          if(kimg == 1) then
             do ii = ista_g1k(ik), iend_g1k(ik) ! MPI
                iadd = ii - ista_g1k(ik) + 1
                hr2 = zah_l(iadd,ib2_,1,1)
                dr2 = zat_l(iadd,ib2, 1,1)
                dr1 = zat_l(iadd,ib1, 1,1)
                w1hw2(ip0) = w1hw2(ip0) + dr1*hr2
                w1sw2(ip0) = w1sw2(ip0) + dr1*dr2
             end do
          else
             if(k_symmetry(ik) == GAMMA) then
                do ii = max(ista_g1k(ik),2), iend_g1k(ik) ! MPI
                   iadd = ii - ista_g1k(ik) + 1
                   hr2 = zah_l(iadd,ib2_,1,1) ! MPI
                   hi2 = zah_l(iadd,ib2_,2,1) ! MPI
                   dr2 = zat_l(iadd,ib2, 1,1) ! MPI
                   di2 = zat_l(iadd,ib2, 2,1) ! MPI
                   dr1 = zat_l(iadd,ib1, 1,1) ! MPI
                   di1 = zat_l(iadd,ib1, 2,1) ! MPI
                   w1hw2(ip0) =w1hw2(ip0)+(dr1*hr2+di1*hi2)*2.d0
                   w1sw2(ip0) =w1sw2(ip0)+(dr1*dr2+di1*di2)*2.d0
                end do
                if(ista_g1k(ik) == 1) then
                   hr2 = zah_l(1,ib2_,1,1) ! MPI
                   hi2 = zah_l(1,ib2_,2,1) ! MPI
                   dr2 = zat_l(1,ib2, 1,1) ! MPI
                   di2 = zat_l(1,ib2, 2,1) ! MPI
                   dr1 = zat_l(1,ib1, 1,1) ! MPI
                   di1 = zat_l(1,ib1, 2,1) ! MPI
                   w1hw2(ip0) =w1hw2(ip0)+dr1*hr2+di1*hi2
                   w1sw2(ip0) =w1sw2(ip0)+dr1*dr2+di1*di2
                end if
             else
                do ii = ista_g1k(ik), iend_g1k(ik) ! MPI
                   iadd = ii - ista_g1k(ik) + 1
                   hr2 = zah_l(iadd,ib2_,1,1) ! MPI
                   hi2 = zah_l(iadd,ib2_,2,1) ! MPI
                   dr2 = zat_l(iadd,ib2, 1,1) ! MPI
                   di2 = zat_l(iadd,ib2, 2,1) ! MPI
                   dr1 = zat_l(iadd,ib1, 1,1) ! MPI
                   di1 = zat_l(iadd,ib1, 2,1) ! MPI
                   w1hw2(2*ip0-1) =w1hw2(2*ip0-1)+dr1*hr2+di1*hi2
                   w1hw2(2*ip0  ) =w1hw2(2*ip0  )+dr1*hi2-di1*hr2
                   w1sw2(2*ip0-1) =w1sw2(2*ip0-1)+dr1*dr2+di1*di2
                   w1sw2(2*ip0  ) =w1sw2(2*ip0  )+dr1*di2-di1*dr2
                end do
             end if
          end if
       end do
    end do
    if(ipridavidson >= 3) call wd_w1hw2(" -- w1hw2 without nl part--")
    ! <n|Vnl|m> G-wise parallel
    if(myrank_g == 0) then
    call add_nonlocal_part ! w1hw2 = w1hw2 + w1Vnlw2
                           ! w1sw2 = w1sw2 + w1qw2
    endif
    if(ipridavidson >= 3) call wd_w1hw2(" -- w1hw2 with nl part--")

!! (spread sum of w1hw2 and w1sw2)
    if(npes > 1) then
       w1hw2_mpi = 0.d0
       w1sw2_mpi = 0.d0
       nshift = 0
       ndata = nsize_mt_now*kimg_t
       call mpi_allreduce(w1hw2(nshift+1),w1hw2_mpi,ndata,mpi_double_precision,mpi_sum,mpi_k_world(myrank_k),ierr) ! MPI
       w1hw2(nshift+1:nshift+ndata) = w1hw2_mpi(1:ndata) ! MPI
       call mpi_allreduce(w1sw2(nshift+1),w1sw2_mpi,ndata,mpi_double_precision,mpi_sum,mpi_k_world(myrank_k),ierr) ! MPI
       w1sw2(nshift+1:nshift+ndata) = w1sw2_mpi(1:ndata) ! MPI
    end if

    if(ipridavidson >= 2) call wd_w1hw2(" -- just after making w1hw2 --")
    if(ipridavidson >= 2) then
       write(nfout,*) 'neordr for ik = ',ik
       write(nfout,9002) (neordr(ib1,ik),ib1=1,neg)
       write(nfout,*) 'nrvf_ordr for ik = ',ik
       write(nfout,9002) (nrvf_ordr(ib1,ik),ib1=1,neg)
       write(nfout,*) 'eig'
       write(nfout,'(5x,10f8.4)') (eko_l(ib1,ik),ib1=1,np_e)
    endif
9002 format(5x,10i8)

!! (Diagonalization )  !!

    if(kimg_t == 1) then
       call dspgvx_driver(eig,vec,w1hw2,w1sw2,ierr_diag)
    else
       call zhpgvx_driver(eig,vec,w1hw2,w1sw2,ierr_diag)
    endif

    if(ierr_diag /= 0) then
       zaj_l(:,:,ik,:) = zaj_l_backup(:,:,:)
#ifdef SAVE_FFT_TIMES
       if(sw_save_fft == ON) status_saved_phifftr(:,ik) = OLD
#endif
       do ib1 = 1, np_e
          eko_l(ib1,ik)=eko_d(neg_g(ib1))
       end do
       if(ipridavidson >= 2) then
          write(nfout,*) '** Mod Davidson SR error **'
       end if
    else

!!$       if(ipridavidson >= 2) then
       if(ipri0 >= 2) then
          write(nfout,*) 'eko_d for ik = ',ik
          write(nfout,9001) (eko_d(ib),ib=1,neg)
          write(nfout,*) 'eig for ik = ',ik
          write(nfout,9001) (eig(ib),ib=1,neg)
          call wd_w1hw2(" -- after diagonalization --")
!sum eko
          dr1=0.d0;dr2=0.d0
          do ib1=1,np_e
             ib1to = neordr(neg_g(ib1),ik)
             if(map_e(ib1to) == myrank_e) dr1=dr1+eko_l(ib1,ik) ! MPI
             dr2=dr2+eig(neg_g(ib1))
          enddo
          call mpi_allreduce(MPI_IN_PLACE,dr1,1,mpi_double_precision,mpi_sum,mpi_kg_world,ierr) ! MPI
          write(nfout,'(" sum of eko_l, eig, abs diff =",3e25.10)') dr1,dr2,abs(dr2-dr1)
       endif
!! (subspace rotation) !!
       call subspace_rotation ! vec,zat_l -> zat_l
!( tenchi ) (zaj_l <- zat_l)
       if(ipridavidson>=2 .and. ik==1) write(nfout,'(" !### zaj_l is new,  bfft is old")')
       do ib1 = 1, np_e
          zaj_l(:,ib1,ik,:) = zat_l(:,neg_g(ib1),:,1)
#ifdef SAVE_FFT_TIMES
          if(sw_save_fft == ON) status_saved_phifftr(ib1,ik) = OLD
#endif
       enddo
       zaj_l_backup(:,:,:) = zaj_l(:,:,ik,:)
!! (eko_l)
       do ib1 = 1, np_e
          eko_l(ib1,ik)=eig(neg_g(ib1))
       end do
       if(ipridavidson >= 2) then
          eko2 = sum(eig(1:neg))
          write(nfout,1201) ik,eko1,ekod,eko2

          write(nfout,*) 'eko_l'
          write(nfout,9001) (eko_l(ib1,ik),ib1=1,np_e)
       endif
1201 format(' %% for ik = ',i4,4x,' eko1&ekod&eko2 = ',3f14.7)
9001 format(5x,6f12.5)
!! (neordr & nrvf_ordr)

    end if

    neordr(1:neg,ik) = (/(ib1,ib1=1,neg)/)
    nrvf_ordr(1:neg,ik) = (/(ib1,ib1=1,neg)/)

! (deallocate)
    deallocate(eko_d)
    deallocate(eig)
    deallocate(vec)
    deallocate(occup)

    call tstatc0_end(id_sname)

  contains
    subroutine get_ipri0(ipri_in, ipri_out)
      integer, intent(in)  :: ipri_in
      integer, intent(out) :: ipri_out
      if(npes > 1) then
         if(mype == 0) ipri_out = ipri_in
         call mpi_bcast(ipri_out,1,mpi_integer,0,mpi_k_world(myrank_k),ierr)
      else
         ipri_out = ipri_in
      end if
    end subroutine get_ipri0

    subroutine wd_w1hw2(somecomment)
      character(len=*), intent(in) :: somecomment
      integer :: ib1, ib2, neg_wd, nsb_wd
      write(nfout,'(a35)') somecomment
      write(nfout,*) 'w1hw2 for ik = ',ik
      neg_wd = 8
      nsb_wd = 8
      if(neg_wd > neg) neg_wd = neg
      if(nsb_wd > nsize_sb_now) nsb_wd = nsize_sb_now
      if(kimg_t==1) then
         do ib2=1,nsb_wd
            ip0b = ib2*(ib2-1)/2
            write(nfout,9001) (w1hw2(ip0),ip0=ip0b+1,ip0b+ib2)
         end do
      else
         do ib2=1,nsb_wd
            ip0b = ib2*(ib2-1)/2
            write(nfout,9001) (w1hw2(2*ip0-1),ip0=ip0b+1,ip0b+ib2)
         end do
         do ib2=1,nsb_wd
            ip0b = ib2*(ib2-1)/2
            write(nfout,9001) (w1hw2(2*ip0),ip0=ip0b+1,ip0b+ib2)
         end do
      end if
      write(nfout,*) 'w1sw2 for ik = ',ik
      if(kimg_t==1) then
         do ib2=1,nsb_wd
            ip0b = ib2*(ib2-1)/2
            write(nfout,9001) (w1sw2(ip0),ip0=ip0b+1,ip0b+ib2)
         end do
      else
         do ib2=1,nsb_wd
            ip0b = ib2*(ib2-1)/2
            write(nfout,9001) (w1sw2(2*ip0-1),ip0=ip0b+1,ip0b+ib2)
         end do
         do ib2=1,nsb_wd
            ip0b = ib2*(ib2-1)/2
            write(nfout,9001) (w1sw2(2*ip0),ip0=ip0b+1,ip0b+ib2)
         end do
      end if
9001  format(5x,9f12.5)
      write(nfout,*) 'eko_l for ik = ',ik
      write(nfout,9001) (eko_d(neordr(ib1,ik)),ib1=1,neg_wd)
    end subroutine wd_w1hw2

    subroutine add_nonlocal_part
      integer :: ip,ib1,ib2
      integer       :: ia, lmt1, lmt2, it, p, s, ib
      real(kind=DP) :: facv,facq,vr,vi,qr,qi
      real(kind=DP) :: tmpr,tmpi
! ========================== added by K. Tagami ========== 11.0
#ifdef forsafe
      integer :: ipaw_tmp
#endif
! ======================================================== 11.0
      do ib2 = 1,nsize_sb_now
         ip0b = ib2*(ib2-1)/2
         do ib1 = 1,ib2
            ip0 = ip0b + ib1
            if(mod(ip0-1,nrank_e)/=myrank_e) cycle
            if(kimg_t==1) then
               vr=0.d0
               qr=0.d0
            else
               vr=0.d0
               vi=0.d0
               qr=0.d0
               qi=0.d0
            end if
            do ia = 1, natm
               it = ityp(ia)
! ========================== added by K. Tagami =================== 11.0
#ifdef forsafe
               ipaw_tmp = ipaw(it)
#endif
! ================================================================= 11.0
               do lmt1 = 1, ilmt(it)
                  p = lmta(lmt1,ia)
!OCL NOPREEX
                  do lmt2 = 1, ilmt(it)
                     s = lmta(lmt2,ia)
! ========================== modified by K. Tagami =================== 11.0
!                     if(ipaw(it)==0)then
#ifdef forsafe
                     if ( ipaw_tmp == 0 ) then
#else
                     if (ipaw(it)==0 ) then
#endif
! ================================================================= 11.0
                        facv   = iwei(ia)*(dion(lmt1,lmt2,it) + vlhxcQ(lmt1,lmt2,ia,ispin))
                     else
                        facv   = iwei(ia)*(dion_paw(lmt1,lmt2,ispin,ia) + vlhxcQ(lmt1,lmt2,ia,ispin))
                     endif
                     facq   = iwei(ia)*q(lmt1,lmt2,it)
                     if(kimg==1) then
                        tmpr = fsr(ib1,p,1)*fsr(ib2,s,1)&
                    &        + fsi(ib1,p,1)*fsi(ib2,s,1)
                        vr = vr + facv*tmpr
                        qr = qr + facq*tmpr
                     else
                        if(k_symmetry(ik) == GAMMA) then
                           tmpr = fsr(ib1,p,1)*fsr(ib2,s,1)
                           vr = vr + facv*tmpr
                           qr = qr + facq*tmpr
                        else
                           tmpr = fsr(ib1,p,1)*fsr(ib2,s,1)&
                    &        + fsi(ib1,p,1)*fsi(ib2,s,1)
                           tmpi = fsr(ib1,p,1)*fsi(ib2,s,1)&
                    &        - fsi(ib1,p,1)*fsr(ib2,s,1)
                           vr = vr + facv*tmpr
                           vi = vi + facv*tmpi
                           qr = qr + facq*tmpr
                           qi = qi + facq*tmpi
                        end if
                     end if
                  end do
               end do
            end do
            if(kimg_t==1) then
               w1hw2(ip0) = w1hw2(ip0) + vr
               w1sw2(ip0) = w1sw2(ip0) + qr
            else
               w1hw2(2*ip0-1) = w1hw2(2*ip0-1) + vr
               w1hw2(2*ip0  ) = w1hw2(2*ip0  ) + vi
               w1sw2(2*ip0-1) = w1sw2(2*ip0-1) + qr
               w1sw2(2*ip0  ) = w1sw2(2*ip0  ) + qi
            end if
         end do
      end do
    end subroutine add_nonlocal_part

    subroutine subspace_rotation
      integer :: ib1,ib2,ibb2,iadd
!!$      real(kind=DP), dimension(np_g1k_x,neg,kimg) :: zaj_wk
      real(kind=DP), allocatable, dimension(:,:,:) :: zaj_wk
!     allocate(zaj_wk(np_g1k_x,neg,kimg))
      allocate(zaj_wk(maxval(np_g1k),np_e,kimg))

      zaj_wk(:,:,:) = 0.d0
      if(kimg==1) then
         do ib1=1,np_e
            do ib2=1,neg
               do ii=ista_g1k(ik),iend_g1k(ik)
                  iadd = ii - ista_g1k(ik) + 1
                  zaj_wk(iadd,ib1,kimg) = zaj_wk(iadd,ib1,kimg) + zat_l(iadd,ib2,kimg,1)*vec(ib2,neg_g(ib1))
               end do
            end do
         end do
      else
         if(k_symmetry(ik) == GAMMA) then
            do ib1=1,np_e
               do ib2=1,neg
                  hr2=vec(ib2,neg_g(ib1))
                  do ii=ista_g1k(ik),iend_g1k(ik)
                     iadd = ii - ista_g1k(ik) + 1
                     dr1=zat_l(iadd,ib2,1   ,1)
                     di1=zat_l(iadd,ib2,kimg,1)
                     zaj_wk(iadd,ib1,1   ) = zaj_wk(iadd,ib1,1   ) + dr1*hr2
                     zaj_wk(iadd,ib1,kimg) = zaj_wk(iadd,ib1,kimg) + di1*hr2
                  end do
               end do
            end do
         else
            do ib1=1,np_e
               do ib2=1,neg
                  hr2=vec(2*ib2-1,neg_g(ib1))
                  hi2=vec(2*ib2  ,neg_g(ib1))
                  do ii=ista_g1k(ik),iend_g1k(ik)
                     iadd = ii - ista_g1k(ik) + 1
                     dr1=zat_l(iadd,ib2,1   ,1)
                     di1=zat_l(iadd,ib2,kimg,1)
                     zaj_wk(iadd,ib1,1   ) = zaj_wk(iadd,ib1,1   ) + dr1*hr2 - di1*hi2
                     zaj_wk(iadd,ib1,kimg) = zaj_wk(iadd,ib1,kimg) + dr1*hi2 + di1*hr2
                  end do
               end do
            end do
         end if
      end if
      zat_l(:,:,:,1) = 0.0d0
      do ib1 = 1, np_e
         zat_l(:,neg_g(ib1),:,1) = zaj_wk(:,ib1,:)
      enddo
      call mpi_allreduce(MPI_IN_PLACE,zat_l(:,:,:,1),maxval(np_g1k)*neg*kimg,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_kg_world,ierr)
      deallocate(zaj_wk)
    end subroutine subspace_rotation

  end subroutine evolve_WFs_in_subspace_sr_3D


  subroutine dspgvx_driver(eig,vec,w1hw2,w1sw2,ierr,nel)
    real(kind=DP), intent(out) ,dimension(nsize_sb_now) :: eig
    real(kind=DP), intent(out) ,dimension(nsize_sb_now*neg) :: vec
    real(kind=DP), intent(inout) ,dimension(nsize_mt_now) :: w1hw2,w1sw2
    integer, intent(out) :: ierr
    integer, intent(in), optional :: nel

    integer :: ITYPE
    character(len=1) :: JOBZ,RANGE,UPLO
    integer :: il,iu
    real(kind=DP),allocatable,dimension(:) :: work_lapack
    integer, allocatable, dimension(:) :: iwork_lapack, ifail_lapack
    real(kind=DP) :: vl,vu,abstol
    integer :: info,m
    real(kind=DP), external :: dlamch
    !!$real(kind=DP), dimension(nsize_mt_now) :: ap,bp
    real(kind=DP), allocatable, dimension(:) :: ap,bp

    allocate(ap(nsize_mt_now),bp(nsize_mt_now))
    abstol = 2*dlamch('S')

    il=1; iu=neg
    if(present(nel)) iu=nel
!(LAPACK)  ITYPE = 1:  A*x = (lambda)*B*x, 2:  A*B*x = (lambda)*x, 3:  B*A*x = (lambda)*x
    ITYPE = 1
!(LAPACK)  JOBZ = N : eigenvalue, V : eigenvalue + eigenvector
    JOBZ = 'V'
!(LAPACK)  RANGE = A : all eigenvalues, V: all eigenvalues in (VL, VU], I: the IL-th through IU-th eigenvalues
    RANGE = 'I'
!(LAPACK)  UPLO = U : upper triangle matrix,  L : lower triangle matrix
    UPLO = 'U'
    allocate(work_lapack(8*nsize_sb_now))
    allocate(iwork_lapack(5*nsize_sb_now))
    allocate(ifail_lapack(nsize_sb_now))

    ap = w1hw2
    bp = w1sw2

    call dspgvx(ITYPE,JOBZ,RANGE,UPLO,nsize_sb_now,ap,bp &
    &          ,vl,vu,il,iu,abstol,m,eig,vec,nsize_sb_now &
    &          ,work_lapack,iwork_lapack,ifail_lapack,info)

    if(ipridavidson >=2 .and. info/=0) then
       write(nfout,*) "debug(dspgvx) info=",info
       write(nfout,*) "debug(dspgvx) ifail"
       write(nfout,'(8(1x,i3))') ifail_lapack
       write(nfout,*) "debug(dspgvx) eig"
       write(nfout,'(8(1x,f10.5))') eig
    end if

    deallocate(work_lapack)
    deallocate(iwork_lapack)
    deallocate(ifail_lapack)

    deallocate(ap,bp)

    if(info/=0) then
       !!write(nfout,*) "dspgvx: info=",info
       !!stop 'error in dspgvx_driver'
       ierr = 1
    else
       ierr = 0
    end if

  end subroutine dspgvx_driver

  subroutine dspgvx_driver_loc(eig,vec,w1hw2,w1sw2,ierr,nel)
    integer, intent(in) :: nel
    real(kind=DP), intent(out) ,dimension(nsize_sb_now) :: eig
    real(kind=DP), intent(out) ,dimension(nsize_sb_now*nel) :: vec
    real(kind=DP), intent(inout) ,dimension(nsize_mt_now) :: w1hw2,w1sw2
    integer, intent(out) :: ierr

    integer :: ITYPE
    character(len=1) :: JOBZ,RANGE,UPLO
    integer :: il,iu
    real(kind=DP),allocatable,dimension(:) :: work_lapack
    integer, allocatable, dimension(:) :: iwork_lapack, ifail_lapack
    real(kind=DP) :: vl,vu,abstol
    integer :: info,m
    real(kind=DP), external :: dlamch
    !!$real(kind=DP), dimension(nsize_mt_now) :: ap,bp
    real(kind=DP), allocatable, dimension(:) :: ap,bp

    allocate(ap(nsize_mt_now),bp(nsize_mt_now))
    abstol = 2*dlamch('S')

    il=1; iu=nel
!(LAPACK)  ITYPE = 1:  A*x = (lambda)*B*x, 2:  A*B*x = (lambda)*x, 3:  B*A*x = (lambda)*x
    ITYPE = 1
!(LAPACK)  JOBZ = N : eigenvalue, V : eigenvalue + eigenvector
    JOBZ = 'V'
!(LAPACK)  RANGE = A : all eigenvalues, V: all eigenvalues in (VL, VU], I: the IL-th through IU-th eigenvalues
    RANGE = 'I'
!(LAPACK)  UPLO = U : upper triangle matrix,  L : lower triangle matrix
    UPLO = 'U'
    allocate(work_lapack(8*nsize_sb_now))
    allocate(iwork_lapack(5*nsize_sb_now))
    allocate(ifail_lapack(nsize_sb_now))

    ap = w1hw2
    bp = w1sw2

    call dspgvx(ITYPE,JOBZ,RANGE,UPLO,nsize_sb_now,ap,bp &
    &          ,vl,vu,il,iu,abstol,m,eig,vec,nsize_sb_now &
    &          ,work_lapack,iwork_lapack,ifail_lapack,info)

!    if(ipridavidson >=2 .and. info/=0) then
    if(info/=0) then
       write(nfout,*) "debug(dspgvx) info=",info
       write(nfout,*) "debug(dspgvx) ifail"
       write(nfout,'(8(1x,i3))') ifail_lapack
       write(nfout,*) "debug(dspgvx) eig"
       write(nfout,'(8(1x,f10.5))') eig
    end if

    deallocate(work_lapack)
    deallocate(iwork_lapack)
    deallocate(ifail_lapack)

    deallocate(ap,bp)

    if(info/=0) then
       !!write(nfout,*) "dspgvx: info=",info
       !!stop 'error in dspgvx_driver'
       ierr = 1
    else
       ierr = 0
    end if

  end subroutine dspgvx_driver_loc
#ifdef NEC_TUNE
  subroutine dsygvx_driver(eig, vec, w1hw2, w1sw2, ierr, nel)
    integer, intent(in) :: nel
    real(kind=DP), intent(out),   dimension(nsize_sb_now)               :: eig
    real(kind=DP), intent(out),   dimension(nsize_sb_now*nel)           :: vec
    real(kind=DP), intent(inout), dimension(nsize_sb_now, nsize_sb_now) :: w1hw2, w1sw2
    integer, intent(out) :: ierr
    integer :: ITYPE, il, iu, lwork, info, m
    character(len=1) :: JOBZ, RANGE, UPLO
    real(kind=DP) :: vl, vu, abstol, work_tmp
    real(kind=DP), allocatable, dimension(:) :: work
    integer, allocatable, dimension(:) :: iwork, ifail
    real(kind=DP), external :: dlamch

    abstol = 2*dlamch('S')
    il = 1; iu = nel
    ITYPE = 1; JOBZ = 'V'; RANGE = 'I'; UPLO = 'U'

    allocate(iwork(5*nsize_sb_now))
    allocate(ifail(nsize_sb_now))

    call dsygvx(ITYPE, JOBZ, RANGE, UPLO, nsize_sb_now, &
   &            w1hw2, nsize_sb_now, w1sw2, nsize_sb_now, &
   &            vl, vu, il, iu, abstol, m, eig, vec, nsize_sb_now, &
   &            work_tmp, -1, iwork, ifail, info)

    lwork = int(work_tmp)
    if(lwork<8*nsize_sb_now) lwork = 8*nsize_sb_now
    allocate(work(lwork))

    call dsygvx(ITYPE, JOBZ, RANGE, UPLO, nsize_sb_now, &
   &            w1hw2, nsize_sb_now, w1sw2, nsize_sb_now, &
   &            vl, vu, il, iu, abstol, m, eig, vec, nsize_sb_now, &
   &            work, lwork, iwork, ifail, info)

    if(info /= 0) then
       write(nfout,*) "debug(dsygvx) info=",info
       write(nfout,*) "debug(dsygvx) ifail"
       write(nfout,'(8(1x,i3))') ifail
       write(nfout,*) "debug(dsygvx) eig"
       write(nfout,'(8(1x,f10.5))') eig
    end if

    deallocate(work)
    deallocate(iwork)
    deallocate(ifail)

    if(info /= 0) then
       ierr = 1
    else
       ierr = 0
    end if

  end subroutine dsygvx_driver
#endif

  subroutine zhpgvx_driver(eig,vec,w1hw2,w1sw2,ierr,nel)
    real(kind=DP), intent(out) ,dimension(nsize_sb_now) :: eig
    real(kind=DP), intent(out) ,dimension(nsize_sb_now*kimg,nsize_sb_now) :: vec
    real(kind=DP), intent(in) ,dimension(nsize_mt_now*kimg) :: w1hw2,w1sw2
    integer, intent(out) :: ierr
    integer, intent(in), optional :: nel
    integer :: ITYPE
    character(len=1) :: JOBZ,RANGE,UPLO
    integer :: il,iu
    real(kind=DP),allocatable,dimension(:) :: work_lapack
    real(kind=DP),allocatable,dimension(:) :: rwork_lapack
    integer, allocatable, dimension(:) :: iwork_lapack, ifail_lapack
    real(kind=DP) :: vl,vu,abstol
    integer :: info,m
    real(kind=DP), external :: dlamch
!!$    real(kind=DP), dimension(nsize_mt_now*kimg) :: ap,bp
    real(kind=DP), allocatable, dimension(:) :: ap,bp
    integer :: ib,i

    abstol = 2*dlamch('S')

    il=1; iu=neg
    if(present(nel)) iu=nel
!(LAPACK)  ITYPE = 1:  A*x = (lambda)*B*x, 2:  A*B*x = (lambda)*x, 3:  B*A*x = (lambda)*x
    ITYPE = 1
!(LAPACK)  JOBZ = N : eigenvalue, V : eigenvalue + eigenvector
    JOBZ = 'V'
!(LAPACK)  RANGE = A : all eigenvalues, V: all eigenvalues in (VL, VU], I: the IL-th through IU-th eigenvalues
    RANGE = 'I'
!(LAPACK)  UPLO = U : upper triangle matrix,  L : lower triangle matrix
    UPLO = 'U'
    allocate(work_lapack(2*nsize_sb_now*kimg)); work_lapack = 0.d0
    allocate(rwork_lapack(7*nsize_sb_now)); rwork_lapack = 0.d0
    allocate(iwork_lapack(5*nsize_sb_now)); iwork_lapack = 0
    allocate(ifail_lapack(nsize_sb_now)); ifail_lapack=0
    allocate(ap(nsize_mt_now*kimg));ap=0.d0
    allocate(bp(nsize_mt_now*kimg));bp=0.d0

    if(ipridavidson >=3 ) then
       write(nfout,*) "debug(zhpgvx) i,w1hw2,w1sw2"
       do ib=1,nsize_sb_now
          i = ib*(ib-1)/2 + ib
          write(nfout,*) ib,w1hw2(2*i-1),w1sw2(2*i-1)
       end do
       ap = w1sw2
       write(nfout,*) "debug(zhpgvx) i,ap,w1sw2"
       do i=1,nsize_mt_now
          write(nfout,*) i,ap(2*i-1),w1sw2(2*i-1)
       end do
       call zhpevx('N','A','U',nsize_sb_now,ap,vl,vu,il,iu,abstol,m &
    &          ,eig,vec,nsize_sb_now &
    &          ,work_lapack,rwork_lapack,iwork_lapack,ifail_lapack,info)
       write(nfout,*) "debug(zhpgvx) eig of w1sw2"
       do ib=1,nsize_sb_now
          write(nfout,*) ib,eig(ib)
       end do
    end if

    ap = w1hw2
    bp = w1sw2

    call zhpgvx(ITYPE,JOBZ,RANGE,UPLO,nsize_sb_now,ap,bp &
    &          ,vl,vu,il,iu,abstol,m,eig,vec,nsize_sb_now &
    &          ,work_lapack,rwork_lapack,iwork_lapack,ifail_lapack,info)

!    if(ipridavidson >=2 .and. info/=0) then
    if(info/=0) then
       write(nfout,*) "debug(zhpgvx) info=",info
       write(nfout,*) "debug(zhpgvx) ifail"
       write(nfout,'(8(1x,i3))') ifail_lapack
       write(nfout,*) "debug(zhpgvx) eig"
       write(nfout,'(8(1x,f10.5))') eig
    end if

    deallocate(work_lapack)
    deallocate(rwork_lapack)
    deallocate(iwork_lapack)
    deallocate(ifail_lapack)
    deallocate(ap,bp)

    if(info/=0) then
       !!write(nfout,*) "zhpgvx: info=",info
       !!stop 'error in zhpgvx_driver'
       ierr = 1
    else
       ierr = 0
    end if

  end subroutine zhpgvx_driver
#ifdef NEC_TUNE
  subroutine zhegvx_driver(eig, vec, w1hw2, w1sw2, ierr, nel)
    integer, intent(in):: nel
    real(kind=DP), intent(out),   dimension(nsize_sb_now) :: eig
    real(kind=DP), intent(out),   dimension(nsize_sb_now*kimg, nel) :: vec
    real(kind=DP), intent(inout), dimension(nsize_sb_now*kimg, nsize_sb_now) :: w1hw2, w1sw2
    integer, intent(out) :: ierr
    integer :: ITYPE, il, iu, lwork, info, m
    character(len=1) :: JOBZ, RANGE, UPLO
    complex(kind=DP) :: work_tmp
    complex(kind=DP), allocatable, dimension(:) :: work
    real(kind=DP), allocatable, dimension(:) :: rwork
    integer, allocatable, dimension(:) :: iwork, ifail
    real(kind=DP) :: vl, vu, abstol
    real(kind=DP), external :: dlamch

    abstol = 2*dlamch('S')
    il = 1; iu = nel
    ITYPE = 1; JOBZ = 'V'; RANGE = 'I'; UPLO = 'U'

    allocate(rwork(7*nsize_sb_now))
    allocate(iwork(5*nsize_sb_now))
    allocate(ifail(nsize_sb_now))

    call zhegvx(ITYPE, JOBZ, RANGE, UPLO, nsize_sb_now, &
   &            w1hw2, nsize_sb_now, w1sw2, nsize_sb_now, &
   &            vl, vu, il, iu, abstol, m, eig, vec, nsize_sb_now, &
   &            work_tmp, -1, rwork, iwork, ifail, info)

    lwork = int(real(work_tmp))
    if(lwork<8*nsize_sb_now) lwork = 8*nsize_sb_now
    allocate(work(lwork))

    call zhegvx(ITYPE, JOBZ, RANGE, UPLO, nsize_sb_now, &
   &            w1hw2, nsize_sb_now, w1sw2, nsize_sb_now, &
   &            vl, vu, il, iu, abstol, m, eig, vec, nsize_sb_now, &
   &            work, lwork, rwork, iwork, ifail, info)

    if(info /= 0) then
       write(nfout,*) "debug(zhegvx) info=",info
       write(nfout,*) "debug(zhegvx) ifail"
       write(nfout,'(8(1x,i3))') ifail
       write(nfout,*) "debug(zhegvx) eig"
       write(nfout,'(8(1x,f10.5))') eig
    end if

    deallocate(work)
    deallocate(rwork)
    deallocate(iwork)
    deallocate(ifail)

    if(info /= 0) then
       ierr = 1
    else
       ierr = 0
    end if

  end subroutine zhegvx_driver
#endif

  subroutine m_ESmddavid_Renew_WF(nfout,precon)
    integer, intent(in) :: nfout,precon
    integer             :: ispin, ik, iksnl, switch_of_eko_part
    integer :: iblock,itot
    real(kind=DP), allocatable, dimension(:) ::  afft, bfft
    real(kind=DP), allocatable, dimension(:) :: ekin,p
    real(kind=DP), allocatable, dimension(:) :: afft_l
    real(kind=DP), allocatable, dimension(:,:) :: wk_bfft_l
    real(kind=DP), allocatable, dimension(:,:) :: bfft_l
    integer :: lsize, ibsize, isrsize, fft_l_size
    real(kind=DP), allocatable, dimension(:) :: ekin_l,p_l
    logical :: frestart
    integer :: iblock_now, itot_now, ipri0
    integer :: n_unconv
    real(kind=DP), allocatable, dimension(:,:,:) :: vexx
    real(kind=DP) :: exx
    integer :: ng
#ifdef MPI_FFTW
    integer(C_INTPTR_T)  :: local_n, local_n_offset, alloc_local, lx, ly, lz, mx,my,mz
#endif

!!$    integer :: id_sname = -1

    integer, save :: print_ibsize = ON
#ifdef MPI_FFTW
    lsize = max(maxval(nel_fft_x(:)),maxval(nel_fft_y(:)),maxval(nel_fft_z(:)))
    if(sw_mpi_fftw==ON) then
       lx = fft_box_size_WF(1,0)
       ly = fft_box_size_WF(2,0)
       lz = fft_box_size_WF(3,0)
       if(kimg==2) then
          alloc_local = fftw_mpi_local_size_3d(ly,lz,lx,mpi_ke_world,local_n,local_n_offset)
       else
          alloc_local = fftw_mpi_local_size_3d(ly,lz,lx/2,mpi_ke_world,local_n,local_n_offset)
       endif
       lsize = local_n*lx*lz
       allocate(afft_mpifftw_vlocal(lx,lz,local_n));afft_mpifftw_vlocal=0.d0
       if ( use_metagga .and. vtau_exists ) then
          allocate(cfft_mpifftw_vlocal(lx,lz,local_n));cfft_mpifftw_vlocal=0.d0
       endif
    else
       allocate(afft_l(lsize*kimg), stat=ierr)
       if ( use_metagga .and. vtau_exists ) allocate(cfft_l(lsize*kimg), stat=ierr)
    endif
#else
#ifdef FFT_3D_DIVISION
    lsize = fft_X_x_nel*fft_X_y_nel*fft_X_z_nel
    allocate(afft_l(lsize*2   ), stat=ierr)
    if ( use_metagga .and. vtau_exists ) allocate(cfft_l(lsize*2), stat=ierr)
#else
    lsize = max(maxval(nel_fft_x(:)),maxval(nel_fft_y(:)),maxval(nel_fft_z(:)))
    allocate(afft_l(lsize*kimg), stat=ierr)
    if ( use_metagga .and. vtau_exists ) allocate(cfft_l(lsize*kimg), stat=ierr)
#endif
#endif
    if(ierr /= 0) then
       write(nfout,*)' m_ESmddavid_Renew_WF: Not allocated afft_l array'
       call flush(nfout)
       call mpi_abort(mpi_comm_world, 201, ierr)
    endif
    ibsize = 1
! === FFT Marge. by T. Yamasaki after T.Kato's prescrition in m_ES_WF_by_SDorCG.F90. 2013/02/21 ==
    if (nblocksize_fftw_is_given) then
       ibsize = nblocksize_fftw
       if (ibsize < 1) ibsize = 1
    endif
    if(ipridavidson>=2 .and. print_ibsize == ON) then
       write(nfout,'(" !! ibsize in m_ESmddavid_Renew_WF = ",i8)') ibsize
       print_ibsize = OFF
    end if
!!$! ========================================================================================


    allocate(ekin_l(maxval(np_g1k)),p_l(maxval(np_g1k)))
!    call m_ES_alloc_scss_etc_3D()
!    allocate(afft(nfft)); allocate(bfft(nfft))

    call allocate_matrix_3D
    if(sw_hybrid_functional==ON)then
       call m_ES_EXX_gather_valence_states(nfout)
    endif
#ifdef MPI_FFTW
    if(sw_mpi_fftw==ON)then
      allocate(wk_bfft_l(1,1))
    else
      allocate(wk_bfft_l(lsize*kimg,ibsize))
    endif
#else
    allocate(wk_bfft_l(lsize*kimg,ibsize))
#endif
    allocate(bfft_l(lsize*2   ,ibsize))
!    do ispin = 1, nspin, (af+1)
    do ispin = ista_spin, iend_spin, (af+1)

#ifdef MPI_FFTW
       if(sw_mpi_fftw==ON) then
          call m_ES_Vlocal_in_Rspace_mpifftw3d(ispin,lx,local_n,lz,afft_mpifftw_vlocal)
          if ( use_metagga .and. vtau_exists ) then
             call m_ES_Vlocal_in_Rspace_mpifftw3d( ispin, lx, local_n, lz, &
                  &               cfft_mpifftw_vlocal, vtau_l )
          endif
       else
          call m_ES_Vlocal_in_Rspace_3D(ispin,afft_l,lsize,1,OFF)      ! (ptfft1) vlhxc_l->afft
          if ( use_metagga .and. vtau_exists ) then
             call m_ES_Vlocal_in_Rspace_3D( ispin, cfft_l, lsize, 1, OFF, vtau_l ) ! r space
          endif
       endif
#else
       call m_ES_Vlocal_in_Rspace_3D(ispin,afft_l,lsize,1,OFF)      ! (ptfft1) vlhxc_l->afft
       if ( use_metagga .and. vtau_exists ) then
          call m_ES_Vlocal_in_Rspace_3D( ispin, cfft_l, lsize, 1, OFF, vtau_l ) ! r space
       endif
#endif
!!$       call tstatc0_begin('m_ESmddavid_Renew_WF ', id_sname,1)
       do ik = ispin, kv3-nspin+ispin, nspin
          if(map_k(ik) /= myrank_k) cycle          ! MPI
          isrsize = min(lsize,mp_g1k(ik))
#ifdef MPI_FFTW
          if(sw_mpi_fftw==ON) then
            fft_l_size  = lsize
            call gen_fft_to_WF_map(ik)
            call m_ES_fftbox_map(ik)
          else
            fft_l_size  = nel_fft_x(myrank_g)
          endif
#else
          fft_l_size  = nel_fft_x(myrank_g)
#endif
          if (ierr /= 0) then
             write(nfout,*)' m_ESmddavid_Renew_WF:  Not allocate '
             call flush(nfout)
             call mpi_abort(mpi_comm_world, 205, ierr)
          endif
          iksnl = (ik-1)/nspin + 1
          call allocate_t_matrix_3D(ik) ! alloc(zat_l,zah_l,w1hw2,w1sw2,wfsd_l,bsd[ri]_l)
          call m_pwBS_kinetic_energies(ik,vkxyz,ekin_l) ! (diakin) ->ekin
          if(.not.allocated(eko_l_old)) then
            allocate(eko_l_old(np_e,ista_k:iend_k));eko_l_old = 0.d0
          endif
          feigconv = .false.
!          call set_feigconv()
          call mpi_barrier(mpi_k_world(myrank_k),ierr)
          Loop: do itot=1,max_iter_david
             itot_now = itot
             call m_ES_Vnonlocal_W_3D(ik,iksnl,ispin,switch_of_eko_part=ON,map=feigconv) ! -> vnlph_l

             if ( use_metagga .and. vtau_exists ) then
#ifdef MPI_FFTW
                if ( sw_mpi_fftw == ON ) then
                   call m_ES_con_kindens_to_vnlph_mpfw( ispin, ik, lsize, &
                        &                lx, ly, lz, local_n, cfft_mpifftw_vlocal )
                else
                   call m_ES_contrib_kindens_to_vnlph( ispin, ik, lsize, cfft_l )
                endif
#else
                call m_ES_contrib_kindens_to_vnlph( ispin, ik, lsize, cfft_l )
#endif
             endif

             if(sw_hybrid_functional==ON) call m_ES_Vexx_W(ik)

             if(sw_keep_hloc_phi==ON) then
                call decide_correction_vector_3D( precon, ik, ispin, ekin_l, &
                     &       ibsize, p_l, itot, lsize )
             else
                call decide_correction_vector_3D( precon,ik,ispin,ekin_l, &
                     &       ibsize, p_l, itot, lsize, &
                     &       bfft_l=bfft_l, wk_bfft_l=wk_bfft_l, &
                     &       afft_l=afft_l )
             endif
             call prepare_Hloc_phi_3D( ik, ispin, ekin_l, afft_l, bfft_l, &
                  &          wk_bfft_l, lsize, ibsize, isrsize, fft_l_size,&
                  &          itot )

!print *,ik,itot
!write(nfout,*) 'B',feigconv
             icountocc=0
             icountuocc=0
             Block_Loop: do iblock=1,nblock
                iblock_now = iblock
                call evolve_WFs_in_subspace_3D(ik,ispin,iblock,itot,frestart)
             end do Block_Loop
             if(ipridavidson>=2) &
             write(nfout,'(a,4i8)') 'ik, iter_david, # of converged occup band, # of converged empty band ' &
             ,ik,itot,icountocc,icountuocc
!write(nfout,*) 'A',feigconv
!             call m_ES_betar_dot_WFs_4_each_k(nfout,ik)   ! -> fsr_l,fsi_l
             if(allbands_converged()) then
               if(ipridavidson>=2) &
               write(nfout,'(a,i5,a,i5)') 'kosugi/davidson iteration converged after ',itot,' iterations at kpoint ',ik
               exit
             endif
          end do Loop
          if(ipridavidson>=2) then
             write(nfout,'("Mddavid: ik=",i5," itot=",i5," subspace=",i5)') &
                                                                 ik, itot_now, nsize_sb_now
          end if
          call deallocate_t_matrix
! === DEBUG by tkato 2012/06/14 ================================================
!          deallocate(wk_bfft_l)
!          deallocate(bfft_l)
! ==============================================================================
       enddo      ! k-point loop
!!$       call tstatc0_end(id_sname)
    enddo      ! spin loop

    if ( allocated(cfft_l) ) deallocate( cfft_l )
    if ( allocated(cfft_mpifftw_vlocal) ) deallocate( cfft_mpifftw_vlocal )
    eko_l_old = eko_l

    deallocate(bfft_l)
    deallocate(wk_bfft_l)
    call deallocate_matrix
!    if(ipridavidson>=2) then
!       write(nfout,'("Modified Davidson: max_itot=",i5)') max_itot
!    end if
!
! ========================================================================================
! === NOTE: m_ES_sort_eigen_values_3D causes difference with ORG_Parallel!!! =============
! ========================================================================================
!   call m_ES_sort_eigen_values_3D()
! ========================================================================================
! ========================================================================================
! ========================================================================================
!!!  ( in case of af=1 )
!    if(af /= 0) then
!       call cp_eigen_values_for_af       !-(contained here)
!       call expand_neordr_and_nrvf_ordr  !-(contained here)
!    end if

    call get_ipri0(ipridavidson,ipri0)
    if(ipri0 >= 2) call m_ES_wd_eko_3D(nfout,mode=SCF)
!
!    deallocate(bfft);   deallocate(afft)
    call m_ES_dealloc_scss_etc()
    deallocate(ekin_l,p_l)
#ifdef MPI_FFTW
    if(sw_mpi_fftw==ON) then
      deallocate(afft_mpifftw_vlocal)
    endif
#endif
!.. deallocate(afft_l)
!deallocate(vnldi,hdiag,sdiag)
  contains

    subroutine set_feigconv()
      integer :: i
      feigconv = .false.
      do i=1,np_e
        if(occup_l(i,ik) > 0.d0) then
           if(abs(eko_l_old(i,ik)-eko_l(i,ik)) < delta_eig_occup) &
                                                  feigconv(i) = .true.
        else
           if(abs(eko_l_old(i,ik)-eko_l(i,ik)) < delta_eig_empty) &
                                                  feigconv(i) = .true.
        end if
      enddo
    end subroutine set_feigconv

    subroutine get_ipri0(ipri_in, ipri_out)
      integer, intent(in)  :: ipri_in
      integer, intent(out) :: ipri_out
      if(npes > 1) then
         if(mype == 0) ipri_out = ipri_in
         call mpi_bcast(ipri_out,1,mpi_integer,0,MPI_CommGroup,ierr)
      else
         ipri_out = ipri_in
      end if
    end subroutine get_ipri0

    logical function eigenvalues_are_converged(n_unconv)
       integer, intent(out) :: n_unconv
       integer :: ib

       n_unconv = 0
       eigenvalues_are_converged = .true.
       do ib=1,np_e
          if(.not.feigconv(ib)) then
             eigenvalues_are_converged = .false.
             n_unconv = n_unconv + 1
          end if
       end do
    end function eigenvalues_are_converged

  end subroutine m_ESmddavid_Renew_WF


  subroutine allocate_matrix_3D()
    integer:: i,j

    nblock = npartition_david
    if(np_e .lt. npartition_david) nblock = np_e
!!$    nblock = npartition_mddavid
!!$    if(np_e .lt. npartition_mddavid) nblock = np_e

    allocate(nsize_subspace(nblock))
    allocate(nsize_matrix(nblock))
    allocate(ista_e_l(nblock))
    allocate(iend_e_l(nblock))
    allocate(ielm_e_l(nblock))
    ielm_e_l=np_e/nblock
    j = mod(np_e,nblock)
    do i = 1, j
       ielm_e_l(i) = ielm_e_l(i) + 1
    end do
    ista_e_l(1) = 1
    do i = 2, nblock
       ista_e_l(i) = ista_e_l(i-1) + ielm_e_l(i-1)
       iend_e_l(i-1) = ista_e_l(i) - 1
    end do
    iend_e_l(nblock) = np_e
    do i=1,nblock
      if(sw_MRCV_only==ON)then
         nsize_subspace(i)=ielm_e_l(i)*2
         nsize_matrix(i)=nsize_subspace(i)*(nsize_subspace(i)+1)/2
      else
         nsize_subspace(i)=ielm_e_l(i)*(max_iter_david+1)
         nsize_matrix(i)=nsize_subspace(i)*(nsize_subspace(i)+1)/2
      endif
    end do
    msize_subspace=maxval(nsize_subspace)
    msize_matrix=maxval(nsize_matrix)
    allocate(feigconv(np_e))
    allocate(ibover(msize_subspace,nblock))
    if(sw_MRCV_only==ON)then
       allocate(fsr(np_e,nlmta,2));fsr=0.d0
       if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
          allocate(fsi(np_e,nlmta,2));fsi=0.d0
       end if
    else
       allocate(fsr(np_e,nlmta,max_iter_david+1))
       if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
          allocate(fsi(np_e,nlmta,max_iter_david+1))
       end if
    endif
    eps_residual = eps_david
    if(sw_MRCV_only==OFF)then
       allocate(zajold_l(maxval(np_g1k),np_e,kimg))
       allocate(fsrold_l(np_e,np_fs))
       if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
         allocate(fsiold_l(np_e,np_fs))
       end if
    endif
  end subroutine allocate_matrix_3D

  subroutine deallocate_matrix
    deallocate(nsize_subspace)
    deallocate(nsize_matrix)
    deallocate(ista_e_l)
    deallocate(iend_e_l)
    deallocate(ielm_e_l)
    deallocate(feigconv)
    deallocate(ibover)
    deallocate(fsr)
    if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
       deallocate(fsi)
    end if
    if(sw_MRCV_only==OFF)then
       deallocate(zajold_l)
       deallocate(fsrold_l)
       if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
         deallocate(fsiold_l)
       end if
    endif
  end subroutine deallocate_matrix

  subroutine allocate_t_matrix_3D(ik)
    integer, intent(in) :: ik
    integer :: kimg_t
    if(k_symmetry(ik) == GAMMA) then
       kimg_t = 1
    else
       kimg_t = kimg
    end if
    allocate(zat_l(maxval(np_g1k),np_e,kimg,max_iter_david+1)) ! MPI
    allocate(zah_l(maxval(np_g1k),np_e,kimg,max_iter_david+1)) ! MPI
    allocate(w1hw2(msize_matrix*kimg_t))
    allocate(w1sw2(msize_matrix*kimg_t))
    allocate(wfsd_l(maxval(np_g1k),np_e,ik:ik,kimg)); wfsd_l = 0.d0
!!$    allocate(bsdr_l(np_e,nlmta,ik:ik)); bsdr_l = 0.d0
!!$    allocate(bsdi_l(np_e,nlmta,ik:ik)); bsdi_l = 0.d0
    allocate(bsdr_l(np_e,np_fs,ik:ik)); bsdr_l = 0.d0
    allocate(bsdi_l(np_e,np_fs,ik:ik)); bsdi_l = 0.d0
    if(sw_MRCV_only==OFF)then
       zajold_l(:,:,:)=zaj_l(:,:,ik,:)
       fsrold_l(:,:)=fsr_l(:,:,ik)
       if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
         fsiold_l(:,:)=fsi_l(:,:,ik)
       endif
    end if

  end subroutine allocate_t_matrix_3D

  subroutine deallocate_t_matrix
! ================================== modified by K. Tagami =========== 11.0
!    deallocate(zat_l) ! MPI
!    deallocate(zah_l) ! MPI

       deallocate(zat_l) ! MPI
       deallocate(zah_l) ! MPI
! ==================================================================== 11.0
    deallocate(w1hw2)
    deallocate(w1sw2)
    deallocate(wfsd_l)
    deallocate(bsdr_l)
    deallocate(bsdi_l)
    if(sw_divide_subspace==OFF)then
       if ( allocated( zat_l_t ) ) deallocate(zat_l_t)
       if ( allocated( zah_l_t ) ) deallocate(zah_l_t)
       if ( allocated( feigconv_t ) ) deallocate(feigconv_t)
       if ( allocated( fsr_t) ) deallocate(fsr_t)
       if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
          if ( allocated( fsi_t) ) deallocate(fsi_t)
       endif
       deallocate(w1hw2_mpi)
       deallocate(w1sw2_mpi)
    endif
  end subroutine deallocate_t_matrix

  subroutine prepare_Hloc_phi_3D(ik,ispin,ekin_l,afft_l,bfft_l, &
       &                         wk_bfft_l,lsize,ibsize,isrsize,fft_l_size,itot )
    integer, intent(in) :: ik,ispin,lsize,ibsize,isrsize,fft_l_size
    integer, intent(in) :: itot
    real(kind=DP), intent(in)  :: ekin_l(maxval(np_g1k))
#ifdef FFT_3D_DIVISION
    real(kind=DP), intent(in)  :: afft_l(lsize*2   )
    real(kind=DP), intent(out) :: bfft_l(lsize*2   ,1)
    real(kind=DP), intent(inout) :: wk_bfft_l(lsize*2   ,ibsize)
#else
    real(kind=DP), intent(in)  :: afft_l(lsize*kimg)
    real(kind=DP), intent(out) :: bfft_l(lsize*kimg,ibsize)
    real(kind=DP), intent(inout) :: wk_bfft_l(lsize*kimg,ibsize)
#endif
#ifdef MPI_FFTW
!    complex(kind=DP), pointer :: bfft_mpifftw(:,:,:)
    integer(C_INTPTR_T) :: alloc_local, lx, ly, lz, local_n, local_n_offset
#endif

    integer       :: ib1,i1,ii,ib,iadd
!!$    real(kind=DP), allocatable, dimension(:,:) :: fs_mpi,fs_mpi2
    real(kind=DP), allocatable, dimension(:,:) :: fs_mpi
    real(kind=DP), allocatable, dimension(:,:) :: zah_t ! d(maxval(np_g1k),kimg)
    real(kind=DP) :: denom
    real(kind=DP) :: dr1,dr2,di1,di2,dd
    integer :: is,is1,ib2,ibesize
    integer :: id_sname = -1, id_sname1=-1, id_sname2=-1, id_sname3=-1, ipri0
    real(kind=DP), allocatable, dimension(:,:) :: vtau_phl
    real(kind=DP), allocatable, dimension(:,:) :: vexx
    real(kind=DP) :: exx
    integer :: niter
    integer :: ng, iflag
    integer :: nsize
    call tstatc0_begin('prepare_Hloc_phi_3D ', id_sname,1)

    call get_ipri0(ipridavidson,ipri0)

#ifdef MPI_FFTW
    lx = fft_box_size_WF(1,0)
    ly = fft_box_size_WF(2,0)
    lz = fft_box_size_WF(3,0)
    if(kimg==2)then
      alloc_local = fftw_mpi_local_size_3d(ly,lz,lx,mpi_ke_world,local_n,local_n_offset)
    else
      alloc_local = fftw_mpi_local_size_3d(ly,lz,lx/2,mpi_ke_world,local_n,local_n_offset)
    endif
#endif

    denom = 1.d0/product(fft_box_size_WF(1:3,1))
    if(sw_hybrid_functional==ON) then
        ng = maxval(np_g1k)
        allocate(vexx(ng,kimg))
        vexx = 0.0d0
    endif
    if ( use_metagga .and. vtau_exists ) then
       allocate( vtau_phl(maxval(np_g1k),kimg) );  vtau_phl = 0.0d0
    endif
! (zaj_l <- (T+Vloc)|phi> )
!!    zaj_l(:,:,ik,:) = zajold_l(:,:,:,idavid)
!( tenchi ) (zat_l <- zaj_l)

    call tstatc0_begin('prepare_Hloc(1)', id_sname1)
! === TIMERTIMERTIMER ==========================================================
#ifdef __TIMER_FFT__
call start_timer('prepare_Hloc_phi_3D_000')
#endif
! === TIMERTIMERTIMER ==========================================================

    if(sw_MRCV_only==ON)then
       zat_l(:,:,:,1) = zaj_l(:,:,ik,:)
       zat_l(:,:,:,2) = wfsd_l(:,:,ik,:)
       call m_ES_gather_f_3d_to_2d_k(fsr_l(:,:,ik),  fsr(:,:,1))
       call m_ES_gather_f_3d_to_2d_k(bsdr_l(:,:,ik), fsr(:,:,2))
       if( k_symmetry(ik) /= GAMMA ) then
          call m_ES_gather_f_3d_to_2d_k(fsi_l(:,:,ik),  fsi(:,:,1))
          call m_ES_gather_f_3d_to_2d_k(bsdi_l(:,:,ik), fsi(:,:,2))
       endif
    else
       zat_l(:,:,:,itot+1) = wfsd_l(:,:,ik,:)
!!$       fsr(:,:,itot+1)=bsdr_l(:,:,ik)
       call m_ES_gather_f_3d_to_2d_k(bsdr_l(:,:,ik),  fsr(:,:,itot+1))
       if( k_symmetry(ik) /= GAMMA ) then
         call m_ES_gather_f_3d_to_2d_k(bsdi_l(:,:,ik),  fsi(:,:,itot+1))
       endif
    endif
!   allocate(fs_mpi(np_e,nlmta))
!   if(sw_MRCV_only==ON)then
!      zat_l(:,:,:,1) = zaj_l(:,:,ik,:)
!      zat_l(:,:,:,2) = wfsd_l(:,:,ik,:)
!     fs_mpi=0.d0
!     do is = 1, np_fs ! MPI
!        is1=nis_fs(myrank_g)+is-1
!        do ib = 1, np_e ! MPI
!           fs_mpi(ib,is1) = fsr_l(ib,is,ik)
!        end do
!     end do
!     call mpi_allreduce(MPI_IN_PLACE,fs_mpi,np_e*nlmta &
!       & ,mpi_double_precision,mpi_sum,mpi_ke_world,ierr)
!     fsr(1:np_e,1:nlmta,1) = fs_mpi(1:np_e,1:nlmta)
!!$       fsr(:,:,2)=bsdr_l(:,:,ik)
!      fs_mpi=0.d0
!      do is = 1, np_fs ! MPI
!         is1=nis_fs(myrank_g)+is-1
!         do ib = 1, np_e ! MPI
!            fs_mpi(ib,is1) = bsdr_l(ib,is,ik)
!         end do
!      end do
!      call mpi_allreduce(MPI_IN_PLACE,fs_mpi,np_e*nlmta &
!        & ,mpi_double_precision,mpi_sum,mpi_ke_world,ierr)
!      fsr(1:np_e,1:nlmta,2) = fs_mpi(1:np_e,1:nlmta)

!      if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
!        fs_mpi=0.d0
!        do is = 1, np_fs ! MPI
!           is1=nis_fs(myrank_g)+is-1
!           do ib = 1, np_e ! MPI
!              fs_mpi(ib,is1) = fsi_l(ib,is,ik)
!           end do
!        end do
!        call mpi_allreduce(MPI_IN_PLACE,fs_mpi,np_e*nlmta &
!          & ,mpi_double_precision,mpi_sum,mpi_ke_world,ierr)
!        fsi(1:np_e,1:nlmta,1) = fs_mpi(1:np_e,1:nlmta)

!!$          fsi(:,:,2)=bsdi_l(:,:,ik)
!         fs_mpi=0.d0
!         do is = 1, np_fs ! MPI
!            is1=nis_fs(myrank_g)+is-1
!            do ib = 1, np_e ! MPI
!               fs_mpi(ib,is1) = bsdi_l(ib,is,ik)
!            end do
!         end do
!         call mpi_allreduce(MPI_IN_PLACE,fs_mpi,np_e*nlmta &
!              & ,mpi_double_precision,mpi_sum,mpi_ke_world,ierr)       ! MPI
!         fsi(1:np_e,1:nlmta,2) = fs_mpi(1:np_e,1:nlmta)
!      end if
!   else
!      zat_l(:,:,:,itot+1) = wfsd_l(:,:,ik,:)
!!$       fsr(:,:,itot+1)=bsdr_l(:,:,ik)
!      fs_mpi=0.d0
!      do is = 1, np_fs ! MPI
!         is1=nis_fs(myrank_g)+is-1
!         do ib = 1, np_e ! MPI
!            fs_mpi(ib,is1) = bsdr_l(ib,is,ik)
!         end do
!      end do
!      call mpi_allreduce(MPI_IN_PLACE,fs_mpi,np_e*nlmta &
!        & ,mpi_double_precision,mpi_sum,mpi_ke_world,ierr)       ! MPI
!      fsr(1:np_e,1:nlmta,itot+1) = fs_mpi(1:np_e,1:nlmta)

!      if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
!!$          fsi(:,:,itot+1)=bsdi_l(:,:,ik)
!         fs_mpi=0.d0
!         do is = 1, np_fs ! MPI
!            is1=nis_fs(myrank_g)+is-1
!            do ib = 1, np_e ! MPI
!               fs_mpi(ib,is1) = bsdi_l(ib,is,ik)
!            end do
!         end do
!         call mpi_allreduce(MPI_IN_PLACE,fs_mpi,np_e*nlmta &
!              & ,mpi_double_precision,mpi_sum,mpi_ke_world,ierr)       ! MPI
!         fsi(1:np_e,1:nlmta,itot+1) = fs_mpi(1:np_e,1:nlmta)
!      end if
!   endif
!   deallocate(fs_mpi)
! === TIMERTIMERTIMER ==========================================================
#ifdef __TIMER_FFT__
call stop_timer('prepare_Hloc_phi_3D_000')
#endif
! === TIMERTIMERTIMER ==========================================================

    call tstatc0_end(id_sname1)

!!    call tstatc0_begin('prepare_Hloc(2)', id_sname2)
!!    if(itot == 1) then
!!! === TIMERTIMERTIMER ==========================================================
!!#ifdef __TIMER_FFT__
!!call start_timer('prepare_Hloc_phi_3D_001')
!!#endif
!!! === TIMERTIMERTIMER ==========================================================
!!!!$      do ib1 = 1, np_e ! MPI
!!      do ib1 = 1, np_e, ibsize
!!!!$         ib = ib1
!!         ib2 = min(ib1+ibsize-1,np_e)
!!         ibesize = ib2 - ib1 + 1
!!
!!! === TIMERTIMERTIMER ==========================================================
!!#ifdef __TIMER_FFT__
!!call start_timer('- 001 m_ES_WF_in_Rspace_3D')
!!#endif
!!! === TIMERTIMERTIMER ==========================================================
!!#ifdef __TIMER_COMM__
!!         call m_ES_WF_in_Rspace_3D(ik,ib1,ib2,ibsize,lsize,wk_bfft_l,0)
!!#else
!!#ifdef MPI_FFTW
!!         if(sw_mpi_fftw==ON) then
!!           call m_ES_WF_in_Rspace_mpifftw(ista_k,iend_k,ik,ib1,zaj_l)
!!         else
!!           call m_ES_WF_in_Rspace_3D(ik,ib1,ib2,ibsize,lsize,wk_bfft_l)
!!         endif
!!#else
!!         call m_ES_WF_in_Rspace_3D(ik,ib1,ib2,ibsize,lsize,wk_bfft_l)
!!#endif
!!#endif
!!! === TIMERTIMERTIMER ==========================================================
!!#ifdef __TIMER_FFT__
!!call stop_timer('- 001 m_ES_WF_in_Rspace_3D')
!!call start_timer('- 001 m_FFT_Vlocal_W_3D')
!!#endif
!!! === TIMERTIMERTIMER ==========================================================
!!#ifdef FFT_3D_DIVISION
!!       call m_FFT_Vlocal_W_3DIV_3D(afft_l,wk_bfft_l,lsize,ibsize,nel_fft_x(myrank_g))
!!! === TIMERTIMERTIMER ==========================================================
!!#ifdef __TIMER_FFT__
!!call stop_timer('- 001 m_FFT_Vlocal_W_3D')
!!call start_timer('- 001 m_FFT_Direct_3D')
!!#endif
!!! === TIMERTIMERTIMER ==========================================================
!!       call m_FFT_Direct_3DIV_3D(nfout,wk_bfft_l,lsize,ibsize)
!!#else
!!#ifdef MPI_FFTW
!!       if(sw_mpi_fftw==ON) then
!!         nsize = local_n*lx*ly
!!!         call m_FFT_Vlocal_W_mpifftw(afft_l,lsize,nsize)
!!         call m_FFT_Vlocal_W_mpifftw3d(afft_mpifftw_vlocal,lx,local_n,lz)
!!         call m_FFT_Direct_MPI_FFTW(nfout)
!!       else
!!         call m_FFT_Vlocal_W_3D(afft_l,wk_bfft_l,lsize,ibsize,nel_fft_z(myrank_g))
!!         call m_FFT_Direct_XYZ_3D(nfout,wk_bfft_l,lsize,ibsize)
!!       endif
!!#else
!!       if (sw_fft_xzy > 0) then
!!          call m_FFT_Vlocal_W_3D(afft_l,wk_bfft_l,lsize,ibsize,nel_fft_y(myrank_g))
!!! === TIMERTIMERTIMER ==========================================================
!!#ifdef __TIMER_FFT__
!!call stop_timer('- 001 m_FFT_Vlocal_W_3D')
!!call start_timer('- 001 m_FFT_Direct_3D')
!!#endif
!!! === TIMERTIMERTIMER ==========================================================
!!          call m_FFT_Direct_3D(nfout,wk_bfft_l,lsize,ibsize)
!!       else
!!          call m_FFT_Vlocal_W_3D(afft_l,wk_bfft_l,lsize,ibsize,nel_fft_z(myrank_g))
!!! === TIMERTIMERTIMER ==========================================================
!!#ifdef __TIMER_FFT__
!!call stop_timer('- 001 m_FFT_Vlocal_W_3D')
!!call start_timer('- 001 m_FFT_Direct_3D')
!!#endif
!!! === TIMERTIMERTIMER ==========================================================
!!          if(sw_serial_fft == ON) then
!!             call m_ES_WF_2D(ik,wk_bfft_l,ib2,ib1,ibsize,lsize,DIRECT)
!!          else
!!             call m_FFT_Direct_XYZ_3D(nfout,wk_bfft_l,lsize,ibsize)
!!          endif
!!       end if
!!#endif
!!#endif
! === TIMERTIMERTIMER ==========================================================
!!#ifdef __TIMER_FFT__
!!call stop_timer('- 001 m_FFT_Direct_3D')
!!call start_timer('- 001 map_fft_to_WF_3D')
!!#endif
!!! === TIMERTIMERTIMER ==========================================================
!!#ifdef MPI_FFTW
!!         if(sw_mpi_fftw==ON) then
!!           call map_fft_to_WF_mpifftw(ik,lsize,ibesize,bfft_l,isrsize,fft_l_size)
!!         else
!!           call map_fft_to_WF_3D(ik,lsize,ibesize,wk_bfft_l,bfft_l,isrsize,fft_l_size)
!!         endif
!!#else
!!         call map_fft_to_WF_3D(ik,lsize,ibesize,wk_bfft_l,bfft_l,isrsize,fft_l_size)
!!#endif
!!! === TIMERTIMERTIMER ==========================================================
!!#ifdef __TIMER_FFT__
!!call stop_timer('- 001 map_fft_to_WF_3D')
!!call start_timer('- 001 others')
!!#endif
!!! === TIMERTIMERTIMER ==========================================================
!!         do ib = ib1, ib2
!!
!!            if(sw_hybrid_functional==ON) then
!!               if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
!!                  call m_ES_EXX_potential(nfout,ispin,ib,ik,ng,fsr_l(ib,:,ik),fsi_l(ib,:,ik),vexx,0)
!!               else
!!                  call m_ES_EXX_potential(nfout,ispin,ib,ik,ng,fsr_l(ib,:,ik),fsr_l(ib,:,ik),vexx,0)
!!               endif
!!            endif
!!
!!            if(kimg == 1) then
!!               do ii=ista_g1k(ik),iend_g1k(ik)
!!                  iadd = ii - ista_g1k(ik) + 1
!!                  dr1 = zaj_l(iadd,ib,ik,1)
!!                  dr2 = bfft_l(iadd,ib-ib1+1)*denom
!!                  zah_l(iadd,ib,1,1) = ekin_l(iadd)*dr1+dr2
!!               enddo
!!               if(sw_hybrid_functional==ON)then
!!                  zah_l(:,ib,1,1) = zah_l(:,ib,1,1) + vexx(:,1)
!!               endif
!!            else
!!               do ii=ista_g1k(ik),iend_g1k(ik)
!!                  iadd = ii - ista_g1k(ik) + 1
!!                  dr1  = zaj_l(iadd,ib,ik,1)
!!                  di1  = zaj_l(iadd,ib,ik,kimg)
!!                  zah_l(iadd,ib,1,   1)= ekin_l(iadd)*dr1+bfft_l(2*iadd-1,ib-ib1+1)*denom
!!                  zah_l(iadd,ib,kimg,1)= ekin_l(iadd)*di1+bfft_l(2*iadd,  ib-ib1+1)*denom
!!               enddo
!!               if(sw_hybrid_functional==ON)then
!!                  zah_l(:,ib,1,1) = zah_l(:,ib,1,1) + vexx(:,1)
!!                  zah_l(:,ib,kimg,1) = zah_l(:,ib,kimg,1) + vexx(:,kimg)
!!               endif
!!            endif
!!         end do
!!! === TIMERTIMERTIMER ==========================================================
!!#ifdef __TIMER_FFT__
!!call stop_timer('- 001 others')
!!#endif
!!! === TIMERTIMERTIMER ==========================================================
!!      enddo
!!! === TIMERTIMERTIMER ==========================================================
!!#ifdef __TIMER_FFT__
!!call stop_timer('prepare_Hloc_phi_3D_001')
!!#endif
!!! === TIMERTIMERTIMER ==========================================================
!!    end if
!!    call tstatc0_end(id_sname2)

    call tstatc0_begin('prepare_Hloc(3)', id_sname3)
! === TIMERTIMERTIMER ==========================================================
#ifdef __TIMER_FFT__
call start_timer('prepare_Hloc_phi_3D_002')
#endif
! === TIMERTIMERTIMER ==========================================================
!!( tenchi ) (zah_l <- zaj_l)
    zaj_l(:,:,ik,:) = wfsd_l(:,:,ik,:)
#ifdef SAVE_FFT_TIMES
    if(sw_save_fft == ON) status_saved_phifftr(:,ik) = OLD
#endif

!!$    do ib1 = 1, np_e ! MPI
    do ib1 = 1, np_e, ibsize
       if (feigconv(ib1)) cycle
       ib2 = min(ib1+ibsize-1,np_e)
       ibesize = ib2 - ib1 + 1
! === TIMERTIMERTIMER ==========================================================
#ifdef __TIMER_FFT__
call start_timer('- 002 m_ES_WF_in_Rspace_3D')
#endif
! === TIMERTIMERTIMER ==========================================================
#ifdef __TIMER_COMM__
       call m_ES_WF_in_Rspace_3D(ik,ib1,ib2,ibsize,lsize,wk_bfft_l,0)
#else
#ifdef MPI_FFTW
         if(sw_mpi_fftw==ON) then
           call m_ES_WF_in_Rspace_mpifftw(ista_k,iend_k,ik,ib1,zaj_l)
         else
           call m_ES_WF_in_Rspace_3D(ik,ib1,ib2,ibsize,lsize,wk_bfft_l)
         endif
#else
         call m_ES_WF_in_Rspace_3D(ik,ib1,ib2,ibsize,lsize,wk_bfft_l)
#endif
#endif
! === TIMERTIMERTIMER ==========================================================
#ifdef __TIMER_FFT__
call stop_timer('- 002 m_ES_WF_in_Rspace_3D')
call start_timer('- 002 m_FFT_Vlocal_W_3D')
#endif
! === TIMERTIMERTIMER ==========================================================
#ifdef FFT_3D_DIVISION
       call m_FFT_Vlocal_W_3DIV_3D(afft_l,wk_bfft_l,lsize,ibsize,nel_fft_x(myrank_g))
! === TIMERTIMERTIMER ==========================================================
#ifdef __TIMER_FFT__
call stop_timer('- 002 m_FFT_Vlocal_W_3D')
call start_timer('- 002 m_FFT_Direct_3D')
#endif
! === TIMERTIMERTIMER ==========================================================
       call m_FFT_Direct_3DIV_3D(nfout,wk_bfft_l,lsize,ibsize)
#else
#ifdef MPI_FFTW
       if(sw_mpi_fftw==ON) then
         nsize = local_n*lx*ly
!         call m_FFT_Vlocal_W_mpifftw(afft_l,lsize,nsize)
         call m_FFT_Vlocal_W_mpifftw3d(afft_mpifftw_vlocal,lx,local_n,lz)
         call m_FFT_Direct_MPI_FFTW(nfout)
       else
         call m_FFT_Vlocal_W_3D(afft_l,wk_bfft_l,lsize,ibsize,nel_fft_z(myrank_g))
         call m_FFT_Direct_XYZ_3D(nfout,wk_bfft_l,lsize,ibsize)
       endif
#else
       if (sw_fft_xzy > 0) then
          call m_FFT_Vlocal_W_3D(afft_l,wk_bfft_l,lsize,ibsize,nel_fft_y(myrank_g))
! === TIMERTIMERTIMER ==========================================================
#ifdef __TIMER_FFT__
call stop_timer('- 002 m_FFT_Vlocal_W_3D')
call start_timer('- 002 m_FFT_Direct_3D')
#endif
! === TIMERTIMERTIMER ==========================================================
          call m_FFT_Direct_3D(nfout,wk_bfft_l,lsize,ibsize)
       else
          call m_FFT_Vlocal_W_3D(afft_l,wk_bfft_l,lsize,ibsize,nel_fft_z(myrank_g))
! === TIMERTIMERTIMER ==========================================================
#ifdef __TIMER_FFT__
call stop_timer('- 002 m_FFT_Vlocal_W_3D')
call start_timer('- 002 m_FFT_Direct_3D')
#endif
! === TIMERTIMERTIMER ==========================================================
        if(sw_serial_fft ==ON) then
          call m_ES_WF_2D(ik,wk_bfft_l,ib2,ib1,ibsize,lsize,DIRECT)
        else
          call m_FFT_Direct_XYZ_3D(nfout,wk_bfft_l,lsize,ibsize)
        endif
       end if
#endif
#endif
! === TIMERTIMERTIMER ==========================================================
#ifdef __TIMER_FFT__
call stop_timer('- 002 m_FFT_Direct_3D')
call start_timer('- 002 map_fft_to_WF_3D')
#endif
! === TIMERTIMERTIMER ==========================================================
#ifdef MPI_FFTW
         if(sw_mpi_fftw==ON) then
           call map_fft_to_WF_mpifftw(ik,lsize,ibesize,bfft_l,isrsize,fft_l_size)
         else
           call map_fft_to_WF_3D(ik,lsize,ibesize,wk_bfft_l,bfft_l,isrsize,fft_l_size)
         endif
#else
         call map_fft_to_WF_3D(ik,lsize,ibesize,wk_bfft_l,bfft_l,isrsize,fft_l_size)
#endif

! === TIMERTIMERTIMER ==========================================================
#ifdef __TIMER_FFT__
call stop_timer('- 002 map_fft_to_WF_3D')
call start_timer('- 002 others')
#endif
! === TIMERTIMERTIMER ==========================================================
       allocate(zah_t(maxval(np_g1k),kimg))
!!$       if(sw_MRCV_only==ON) then
       do ib = ib1, ib2
          if(sw_hybrid_functional==ON) then
             if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
                call m_ES_EXX_potential(nfout,ispin,ib,ik,ng,bsdr_l(ib,:,ik),bsdi_l(ib,:,ik),vexx)
             else
                call m_ES_EXX_potential(nfout,ispin,ib,ik,ng,bsdr_l(ib,:,ik),bsdr_l(ib,:,ik),vexx)
             endif
          endif

          if ( use_metagga .and. vtau_exists ) then
#ifdef MPI_FFTW
             if ( sw_mpi_fftw == ON ) then
                call m_ES_kindens_to_vnlph_ib2_mpfw( ispin, ik, ib, lsize, &
                     &               lx, ly, lz, local_n, cfft_mpifftw_vlocal, vtau_phl )
             else
                call m_ES_kindens_to_vnlph_ib2( ispin, ik, ib, lsize, cfft_l, vtau_phl )
             endif
#else
             call m_ES_kindens_to_vnlph_ib2( ispin, ik, ib, lsize, cfft_l, vtau_phl )
#endif
          endif

          if(kimg == 1) then
             do ii=ista_g1k(ik),iend_g1k(ik)
                iadd = ii - ista_g1k(ik) + 1
                dr1 = zaj_l(iadd,ib,ik,1)
                dr2 = bfft_l(iadd,ib-ib1+1)*denom
!!$                zah_l(iadd,ib,1,2) = ekin_l(iadd)*dr1+dr2
                zah_t(iadd,1) = ekin_l(iadd)*dr1+dr2
             enddo
             if(sw_MRCV_only == ON) then
                zah_l(:,ib,1,2) = zah_t(:,1)
             else
                zah_l(:,ib,1,itot+1) = zah_t(:,1)
             end if
             if(sw_hybrid_functional==ON)then
                if(sw_MRCV_only==ON)then
                   zah_l(:,ib,1,2) = zah_l(:,ib,1,2) + vexx(:,1)
                else
                   zah_l(:,ib,1,itot+1) = zah_l(:,ib,1,itot+1) + vexx(:,1)
                endif
             endif
             if( use_metagga .and. vtau_exists ) then
                if(sw_MRCV_only==ON)then
                   zah_l(:,ib,1,2)= zah_l(:,ib,1,2) +vtau_phl(:,1)
                else
                   zah_l(:,ib,1,itot+1)= zah_l(:,ib,1,itot+1)+vtau_phl(:,1)
                endif
             endif
          else
             do ii=ista_g1k(ik),iend_g1k(ik)
                iadd = ii - ista_g1k(ik) + 1
                dr1  = zaj_l(iadd,ib,ik,1)
                di1  = zaj_l(iadd,ib,ik,kimg)
!!$                zah_l(iadd,ib,1,   2) = ekin_l(iadd)*dr1+bfft_l(2*iadd-1,ib-ib1+1)*denom
!!$                zah_l(iadd,ib,kimg,2) = ekin_l(iadd)*di1+bfft_l(2*iadd,  ib-ib1+1)*denom
                zah_t(iadd,1   ) = ekin_l(iadd)*dr1+bfft_l(2*iadd-1,ib-ib1+1)*denom
                zah_t(iadd,kimg) = ekin_l(iadd)*di1+bfft_l(2*iadd,  ib-ib1+1)*denom
             enddo
             if(sw_MRCV_only==ON) then
                zah_l(:,ib,1,   2) = zah_t(:,1)
                zah_l(:,ib,kimg,2) = zah_t(:,kimg)
             else
                zah_l(:,ib,1,   itot+1) = zah_t(:,1   )
                zah_l(:,ib,kimg,itot+1) = zah_t(:,kimg)
             end if
             if(sw_hybrid_functional==ON)then
                if(sw_MRCV_only==ON)then
                   zah_l(:,ib,1,2)= zah_l(:,ib,1,2)+vexx(:,1)
                   zah_l(:,ib,kimg,2)= zah_l(:,ib,kimg,2)+vexx(:,kimg)
                else
                   zah_l(:,ib,1,itot+1)= zah_l(:,ib,1,itot+1)+vexx(:,1)
                   zah_l(:,ib,kimg,itot+1)= zah_l(:,ib,kimg,itot+1)+vexx(:,kimg)
                endif
             endif
             if( use_metagga .and. vtau_exists ) then
                if(sw_MRCV_only==ON)then
                   zah_l(:,ib,1,2)= zah_l(:,ib,1,2)+vtau_phl(:,1)
                   zah_l(:,ib,kimg,2)= zah_l(:,ib,kimg,2)+vtau_phl(:,kimg)
                else
                   zah_l(:,ib,1,itot+1)= zah_l(:,ib,1,itot+1)+vtau_phl(:,1)
                   zah_l(:,ib,kimg,itot+1)= zah_l(:,ib,kimg,itot+1)+vtau_phl(:,kimg)
                endif
             endif
          endif
       end do
       deallocate(zah_t)
! === TIMERTIMERTIMER ==========================================================
#ifdef __TIMER_FFT__
call stop_timer('- 002 others')
#endif
! === TIMERTIMERTIMER ==========================================================
    end do
! === TIMERTIMERTIMER ==========================================================
#ifdef __TIMER_FFT__
call stop_timer('prepare_Hloc_phi_3D_002')
#endif
! === TIMERTIMERTIMER ==========================================================

    if(sw_hybrid_functional==ON) deallocate(vexx)
    if ( allocated(vtau_phl) ) deallocate( vtau_phl )

    call tstatc0_end(id_sname3)

    call tstatc0_end(id_sname)
!    stop

  contains

    subroutine get_ipri0(ipri_in, ipri_out)
      integer, intent(in)  :: ipri_in
      integer, intent(out) :: ipri_out
      if(npes > 1) then
         if(mype == 0) ipri_out = ipri_in
         call mpi_bcast(ipri_out,1,mpi_integer,0,mpi_k_world(myrank_k),ierr)
      else
         ipri_out = ipri_in
      end if
    end subroutine get_ipri0

  end subroutine prepare_Hloc_phi_3D

  subroutine evolve_WFs_in_subspace_3D(ik,ispin,iblock,itot,frestart)
    integer, intent(in) :: ik,ispin
    integer, intent(in) :: iblock,itot
    logical, intent(out) :: frestart
! (allocatable variables)
    real(kind=DP), allocatable,dimension(:) ::     eig
    real(kind=DP), allocatable,dimension(:,:) ::   vec

    integer :: i,iel

    integer       :: ib1,ib2,ib1to,ib2to,i1,ii,ri,ib,iadd
    integer       :: ibb1,ibb2
    integer       :: ii1,ii2,iter,iter1,iter2
    real(kind=DP) :: eko1, eko2, ekod
    real(kind=DP) :: hr2,hi2,dr1,dr2,di1,di2,dd
    integer :: ip0,ip0b,ip1,ip1b,ib1n,ib2n,ndata,nshift,kimg_t,ig1
    integer :: noffset
    integer :: nsize_max_sb_now
    integer :: ierr_diag
    integer :: nel,nsta,nend
    integer :: ng
    integer :: id_sname = -1, ipri0
#ifdef NEC_TUNE
    integer :: num, k, lda, ldb, ldc
    real(kind=DP), allocatable, dimension(:,:) :: w1hw2_, w1sw2_
    real(kind=DP), allocatable, dimension(:,:,:) :: zat_t, zah_t
    real(kind=DP), allocatable, dimension(:,:) :: zat_t1, zah_t1
    complex(kind=DP), parameter :: c0 = (0.0d0, 0.0d0), c1 = (1.0d0, 0.0d0)
#endif
    call tstatc0_begin('evolve_WFs_in_subspace  ', id_sname,1)

    call get_ipri0(ipridavidson,ipri0)

    if(k_symmetry(ik) == GAMMA) then
       kimg_t = 1
    else
       kimg_t = kimg
    end if

    nel =ielm_e_l(iblock)
    nsta=ista_e_l(iblock)
    nend=iend_e_l(iblock)

    if(sw_divide_subspace==OFF)then
       nel=neg
       nsta=1
       nend=neg
    endif
    allocate(eko_d(neg));eko_d=0.d0
    allocate(occup(neg));occup=0.d0
    if(sw_divide_subspace==ON)then
        zah_l_p => zah_l
        zat_l_p => zat_l
        fsr_p   => fsr
        fsi_p   => fsi
        feigconv_p => feigconv
        do iel=1,nel
!           eko_d(nsta+iel-1) = eko_l(iel,ik)
!           occup(nsta+iel-1) = occup_l(iel,ik)
           eko_d(nsta+iel-1) = eko_l(nsta+iel-1,ik)
           occup(nsta+iel-1) = occup_l(nsta+iel-1,ik)
        enddo
    else
    endif
    if(ipridavidson>=2 .and. ik==1) write(nfout,'(" !### zaj_l is new,  bfft is old")')

    if(sw_MRCV_only==ON)then
       ip0=nel
       do ib=1,nel
         ibover(ib,iblock)=ib
         if(.not.feigconv_p(nsta+ib-1)) then
           ip0=ip0+1
           ibover(nel+ib,iblock) = ip0
         else
           ibover(nel+ib,iblock) = -1
         end if
       end do
       nsize_sb_now = ip0
       nsize_mt_now = nsize_sb_now*(nsize_sb_now+1)/2
       nsize_max_sb_now = nsize_subspace(iblock)
    else
       if(itot==1) then
         do ib=1,nel
           ibover(ib,iblock)=ib
         end do
         nsize_subspace(iblock)=nel
       end if

       noffset=nel*itot
       ip0=nsize_subspace(iblock)
       do ib=1,nel
         if(.not.feigconv_p(nsta+ib-1)) then
           ip0=ip0+1
           ibover(noffset+ib,iblock) = ip0
         else
           ibover(noffset+ib,iblock) = -1
         end if
       end do

       nsize_sb_now = ip0
       nsize_subspace(iblock) = ip0
       nsize_mt_now = nsize_sb_now*(nsize_sb_now+1)/2
       nsize_max_sb_now = nel*(itot+1)                   !nsize_subspace(iblock)
    endif

    if(ipridavidson >=2) then
       write(nfout,*) 'ibover=',ibover(1:nsize_max_sb_now,iblock)
    end if
!
    allocate(eig(nsize_sb_now)); eig=0.d0
    allocate(vec(nsize_sb_now*kimg_t,nsize_sb_now))

    if(ipridavidson >=2) then
       write(nfout,*) 'Modified Davidson:ik,iblock,nsize_sb_now=', ik,iblock,nsize_sb_now
    end if

    ng = iba(ik)
    if(sw_divide_subspace==OFF) ng = np_g1k(ik)
    eko1 = sum(eko_d(1:neg))
!! (make matrix elements )
!    ! <n|T+Vloc|m> !
#ifdef NEC_TUNE
    allocate(w1hw2_(nsize_sb_now*kimg_t,nsize_sb_now))
    allocate(w1sw2_(nsize_sb_now*kimg_t,nsize_sb_now))

    if(kimg == 1) then
       allocate(zah_t(maxval(np_g1k),nsize_sb_now,1))
       allocate(zat_t(maxval(np_g1k),nsize_sb_now,1))
       num = 0
       do ibb1 = 1, nsize_max_sb_now
          if(ibover(ibb1,iblock) < 0) cycle
          iter1 = (ibb1-1)/nel+1
          ii1  = ibb1-nel*(iter1-1)
          ii1  = nsta+ii1-1
          num = num + 1
          zah_t(:,num,1) = zah_l(:,ii1,1,iter1)
          zat_t(:,num,1) = zat_l(:,ii1,1,iter1)
       end do
       k = np_g1k(ik); lda = maxval(np_g1k); ldb = maxval(np_g1k); ldc = nsize_sb_now
       call dgemm('T','N',num,num,k,1.0d0,zat_t,lda,zah_t,ldb,0.0d0,w1hw2_,ldc)
       call dgemm('T','N',num,num,k,1.0d0,zat_t,lda,zat_t,ldb,0.0d0,w1sw2_,ldc)
    else
      if(k_symmetry(ik) == GAMMA) then
         allocate(zah_t(maxval(np_g1k),nsize_sb_now,kimg))
         allocate(zat_t(maxval(np_g1k),nsize_sb_now,kimg))
         allocate(zah_t1(nsize_sb_now,kimg))
         allocate(zat_t1(nsize_sb_now,kimg))
         num = 0
         do ibb1 = 1, nsize_max_sb_now
            if(ibover(ibb1,iblock) < 0) cycle
            iter1 = (ibb1-1)/nel+1
            ii1  = ibb1-nel*(iter1-1)
            ii1  = nsta+ii1-1
            num = num + 1
            zah_t(:,num,1) = zah_l(:,ii1,1,iter1)
            zah_t(:,num,2) = zah_l(:,ii1,2,iter1)
            zat_t(:,num,1) = zat_l(:,ii1,1,iter1)
            zat_t(:,num,2) = zat_l(:,ii1,2,iter1)
         end do
         if(ista_g1k(ik) == 1) then
            zat_t1(:,:) = zat_t(1,:,:)
            zah_t1(:,:) = zah_t(1,:,:)
            zat_t(1,:,:) = zat_t(1,:,:)/sqrt(2.0d0)
            zah_t(1,:,:) = zah_t(1,:,:)/sqrt(2.0d0)
         end if
         k = np_g1k(ik); lda = maxval(np_g1k); ldb = maxval(np_g1k); ldc = nsize_sb_now
         call dgemm('T','N',num,num,k,2.0d0,zat_t(1,1,1),lda,zah_t(1,1,1),ldb,0.0d0,w1hw2_,ldc)
         call dgemm('T','N',num,num,k,2.0d0,zat_t(1,1,2),lda,zah_t(1,1,2),ldb,1.0d0,w1hw2_,ldc)
         call dgemm('T','N',num,num,k,2.0d0,zat_t(1,1,1),lda,zat_t(1,1,1),ldb,0.0d0,w1sw2_,ldc)
         call dgemm('T','N',num,num,k,2.0d0,zat_t(1,1,2),lda,zat_t(1,1,2),ldb,1.0d0,w1sw2_,ldc)
         if(ista_g1k(ik) == 1) then
            zat_t(1,:,:) = zat_t1(:,:)
            zah_t(1,:,:) = zah_t1(:,:)
         end if
         deallocate(zah_t1)
         deallocate(zat_t1)
      else
         allocate(zah_t(maxval(np_g1k)*kimg,nsize_sb_now,1))
         allocate(zat_t(maxval(np_g1k)*kimg,nsize_sb_now,1))
         num = 0
         do ibb1 = 1, nsize_max_sb_now
            if(ibover(ibb1,iblock) < 0) cycle
            iter1 = (ibb1-1)/nel+1
            ii1  = ibb1-nel*(iter1-1)
            ii1  = nsta+ii1-1
            num = num + 1
            do ii = 1, np_g1k(ik)
               zah_t(2*ii-1,num,1) = zah_l(ii,ii1,1,iter1)
               zah_t(2*ii,  num,1) = zah_l(ii,ii1,2,iter1)
               zat_t(2*ii-1,num,1) = zat_l(ii,ii1,1,iter1)
               zat_t(2*ii,  num,1) = zat_l(ii,ii1,2,iter1)
            end do
         end do
         k = np_g1k(ik); lda = maxval(np_g1k); ldb = maxval(np_g1k); ldc = nsize_sb_now
         call zgemm('C','N',num,num,k,c1,zat_t,lda,zah_t,ldb,c0,w1hw2_,ldc)
         call zgemm('C','N',num,num,k,c1,zat_t,lda,zat_t,ldb,c0,w1sw2_,ldc)
      end if
    end if
#else
    do ibb2 = 1,nsize_max_sb_now
       if(ibover(ibb2,iblock)<0) cycle
       ib2 = ibover(ibb2,iblock)
       iter2 = (ibb2-1)/nel+1
       ii2  = ibb2-nel*(iter2-1)
       ii2  = nsta+ii2-1
       ip0b = ib2*(ib2-1)/2
       do ibb1 = 1,ibb2
          if(ibover(ibb1,iblock)<0) cycle
          ib1 = ibover(ibb1,iblock)
          iter1 = (ibb1-1)/nel+1
          ii1 = ibb1-nel*(iter1-1)
          ii1 = nsta+ii1-1
          ip0 = ip0b + ib1
          if(kimg == 1) then
             w1hw2(ip0) = 0.d0
             w1sw2(ip0) = 0.d0
             do ii = ista_g1k(ik),iend_g1k(ik) ! MPI
                iadd = ii - ista_g1k(ik) + 1
                hr2 = zah_l(iadd,ii2,1,iter2)
                dr2 = zat_l(iadd,ii2,1,iter2)
                dr1 = zat_l(iadd,ii1,1,iter1)
                w1hw2(ip0) = w1hw2(ip0) + dr1*hr2
                w1sw2(ip0) = w1sw2(ip0) + dr1*dr2
             end do
             call mpi_allreduce(MPI_IN_PLACE,w1hw2(ip0),1,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_ke_world,ierr)
             call mpi_allreduce(MPI_IN_PLACE,w1sw2(ip0),1,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_ke_world,ierr)
          else
             if(k_symmetry(ik) == GAMMA) then
                w1hw2(ip0) = 0.d0
                w1sw2(ip0) = 0.d0
                do ii = max(2,ista_g1k(ik)), iend_g1k(ik) ! MPI
                   iadd = ii - ista_g1k(ik) + 1
                   hr2 = zah_l(iadd,ii2,1,iter2) ! MPI
                   hi2 = zah_l(iadd,ii2,2,iter2) ! MPI
                   dr2 = zat_l(iadd,ii2,1,iter2) ! MPI
                   di2 = zat_l(iadd,ii2,2,iter2) ! MPI
                   dr1 = zat_l(iadd,ii1,1,iter1) ! MPI
                   di1 = zat_l(iadd,ii1,2,iter1) ! MPI
                   w1hw2(ip0) =w1hw2(ip0)+(dr1*hr2+di1*hi2)*2.d0
                   w1sw2(ip0) =w1sw2(ip0)+(dr1*dr2+di1*di2)*2.d0
                end do
                if(ista_g1k(ik) == 1) then
                   hr2 = zah_l(1,ii2,1,iter2) ! MPI
                   hi2 = zah_l(1,ii2,2,iter2) ! MPI
                   dr2 = zat_l(1,ii2,1,iter2) ! MPI
                   di2 = zat_l(1,ii2,2,iter2) ! MPI
                   dr1 = zat_l(1,ii1,1,iter1) ! MPI
                   di1 = zat_l(1,ii1,2,iter1) ! MPI
                   w1hw2(ip0) =w1hw2(ip0)+dr1*hr2+di1*hi2
                   w1sw2(ip0) =w1sw2(ip0)+dr1*dr2+di1*di2
                end if
                call mpi_allreduce(MPI_IN_PLACE,w1hw2(ip0),1,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_ke_world,ierr)
                call mpi_allreduce(MPI_IN_PLACE,w1sw2(ip0),1,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_ke_world,ierr)
             else
                w1hw2(2*ip0-1:2*ip0) = 0.d0
                w1sw2(2*ip0-1:2*ip0) = 0.d0
                do ii = ista_g1k(ik), iend_g1k(ik)
                   iadd = ii - ista_g1k(ik) + 1
                   hr2 = zah_l(iadd,ii2,1,iter2) ! MPI
                   hi2 = zah_l(iadd,ii2,2,iter2) ! MPI
                   dr2 = zat_l(iadd,ii2,1,iter2) ! MPI
                   di2 = zat_l(iadd,ii2,2,iter2) ! MPI
                   dr1 = zat_l(iadd,ii1,1,iter1) ! MPI
                   di1 = zat_l(iadd,ii1,2,iter1) ! MPI
                   w1hw2(2*ip0-1) =w1hw2(2*ip0-1)+dr1*hr2+di1*hi2
                   w1hw2(2*ip0  ) =w1hw2(2*ip0  )+dr1*hi2-di1*hr2
                   w1sw2(2*ip0-1) =w1sw2(2*ip0-1)+dr1*dr2+di1*di2
                   w1sw2(2*ip0  ) =w1sw2(2*ip0  )+dr1*di2-di1*dr2
                end do
                call mpi_allreduce(MPI_IN_PLACE,w1hw2(2*ip0-1),2,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_ke_world,ierr)
                call mpi_allreduce(MPI_IN_PLACE,w1sw2(2*ip0-1),2,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_ke_world,ierr)
             end if
          end if
       end do
    end do
#endif
    if(ipridavidson >= 3 .and. ik == 1 .and. iblock == 1) call wd_w1hw2(" -- w1hw2 without nl part--",iblock)
    ! <n|Vnl|m>
!   if(myrank_g == 0) then
    call add_nonlocal_part ! w1hw2 = w1hw2 + w1Vnlw2
                           ! w1sw2 = w1sw2 + w1qw2
!   endif
#ifdef NEC_TUNE
    call mpi_allreduce(MPI_IN_PLACE,w1hw2_,nsize_sb_now*nsize_sb_now*kimg_t,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_ke_world,ierr)
    call mpi_allreduce(MPI_IN_PLACE,w1sw2_,nsize_sb_now*nsize_sb_now*kimg_t,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_ke_world,ierr)
#endif
    if(ipridavidson >= 3 .and. ik == 1 .and. iblock == 1) call wd_w1hw2(" -- w1hw2 with nl part--",iblock)
!
!!$    if(ipridavidson >= 2) call wd_w1hw2(" -- just after making w1hw2 --",iblock)
!    if(ipridavidson >= 2) then
!       write(nfout,*) 'neordr for ik = ',ik
!       write(nfout,9002) (neordr(ib1,ik),ib1=1,neg)
!       write(nfout,*) 'nrvf_ordr for ik = ',ik
!       write(nfout,9002) (nrvf_ordr(ib1,ik),ib1=1,neg)
!       write(nfout,*) 'eig'
!       write(nfout,'(5x,10f8.4)') (eko_l(ib1,ik),ib1=1,np_e)
!    endif
!9002 format(5x,10i8)

!! (Diagonalization )  !!

#ifdef NEC_TUNE
    if(kimg_t == 1) then
       call dsygvx_driver(eig, vec, w1hw2_, w1sw2_, ierr_diag, nel)
    else
       call zhegvx_driver(eig, vec, w1hw2_, w1sw2_, ierr_diag, nel)
    endif
!    deallocate(w1hw2_)
!    deallocate(w1sw2_)
#else
    if(kimg_t == 1) then
       call dspgvx_driver_loc(eig,vec,w1hw2,w1sw2,ierr_diag,nel)
    else
       call zhpgvx_driver(eig,vec,w1hw2,w1sw2,ierr_diag,nel)
    endif
#endif

    frestart = .false.
    if(ierr_diag /= 0) then
!       zaj_l(:,:,ik,:) = zaj_l_backup(:,:,:)
!       do ib1 = 1, neg
!          if(map_e(ib1) == myrank_e) then         ! MPI
!             eko_l(map_z(ib1),ik)=eko_d(ib1)
!          end if
!       end do
      frestart = .true.
! === DEBUG by T.Kato 2013/08/01 ===============================================
!     do i=1,np_e
      do i=1,nel
! ==============================================================================
         zaj_l(:,i-1+nsta,ik,:)=zat_l(:,i,:,1)
      enddo
#ifdef SAVE_FFT_TIMES
      if(sw_save_fft == ON) then
         do i=1,nel
            status_saved_phifftr(i-1+nsta,ik) = OLD
         end do
      end if
#endif
      if(ipridavidson >= 2) then
        write(nfout,*) '** restart Modified Davidson iteration **'
        write(nfout,*) 'ik=',ik,' iblock=',iblock,' itot=',itot
      end if
      write(nfout,*) '** restart Modified Davidson **'
      write(nfout,*) 'ik=',ik,' iblock=',iblock,' itot=',itot
      call wd_w1hw2(" -- restart Modified Davidson iteration --",iblock)
!!$ print *,'Restart'
      goto 9000
    else

!      feigconv(nsta:nend) = .false.
      do ib=1,nel
         if(occup(nsta+ib-1) > 0.d0) then
            if(abs(eko_d(nsta+ib-1)-eig(ib)) < delta_eig_occup) &
                                                   feigconv(nsta+ib-1) = .true.
            if(feigconv(nsta+ib-1)) icountocc=icountocc+1
         else
            if(abs(eko_d(nsta+ib-1)-eig(ib)) < delta_eig_empty) &
                                                   feigconv(nsta+ib-1) = .true.
            if(feigconv(nsta+ib-1)) icountuocc=icountuocc+1
         end if
      end do
      if(ipri0 >= 2) then
         if(ik == 1 .and. iblock == 1) then
            write(nfout,*) 'eko_l for ik = ',ik
            write(nfout,*) 'iblock       = ',iblock
            write(nfout,9001) (eko_d(nsta+ib-1),ib=1,nel)
            write(nfout,*) 'eig for ik = ',ik
            write(nfout,9001) (eig(ib),ib=1,nel)
            if(ik == 1 .and. iblock == 1) call wd_w1hw2(" -- w1hw2 with nl part--",iblock)
            call wd_w1hw2(" -- after diagonalization --",iblock)
!sum eko
            dr1=0.d0;dr2=0.d0
            do ib1=1,nel
               dr1=dr1+eko_d(nsta+ib1-1) ! MPI
               dr2=dr2+eig(ib1)
            enddo
            write(nfout,'(" sum of eko_l, eig, abs diff =",3e25.10)') dr1,dr2,abs(dr2-dr1)
         end if
      endif
    endif

!!! (subspace rotation) !!
       call subspace_rotation ! vec,zat_l -> zat_l
       if(ipridavidson>=2 .and. ik==1 .and. iblock == 1) write(nfout,'(" !### zaj_l is new,  bfft is old")')
!!( tenchi ) (zaj_l <- zat_l)
!       iter = min(idavid+1,ndavid)
!       call m_ES_W_transpose_back(ista_k,iend_k,ik,zaj_l,zat_l(1,1,1,iter))
!!       zaj_l_backup(:,nsta:nend,:) = zaj_l(:,nsta:nend,ik,:)
!! (eko_l)
    if(sw_divide_subspace==ON)then
       do ib1 = 1, nel
          eko_l(nsta+ib1-1,ik)=eig(ib1)
       end do
    else
       do ib1 = 1,neg
          if(map_e(ib1)==myrank_e) eko_l(map_z(ib1),ik) = eig(ib1)
       enddo
    endif

    if(ipridavidson >= 2) then
       if(ik == 1 .and. iblock == 1) then
          eko2 = sum(eig(1:nel))
          write(nfout,1201) ik,eko1,eko2

          write(nfout,*) 'eko_l'
          write(nfout,9001) (eko_d(nsta+ib1-1),ib1=1,nel)
       end if
    endif

1201 format(' %% for ik = ',i4,4x,' eko1&ekod&eko2 = ',3f14.7)
9001 format(5x,6f12.5)
!!! (neordr & nrvf_ordr)
!
9000 continue
!    neordr(1:neg,ik) = (/(ib1,ib1=1,neg)/)
!    nrvf_ordr(1:neg,ik) = (/(ib1,ib1=1,neg)/)
!
!! (deallocate)
#ifdef NEC_TUNE
    deallocate(zah_t)
    deallocate(zat_t)
#endif
    deallocate(eko_d)
    deallocate(occup)

    deallocate(eig)
    deallocate(vec)
#ifdef NEC_TUNE
    deallocate(w1hw2_)
    deallocate(w1sw2_)
#endif
    nullify(zah_l_p,zat_l_p,fsr_p,fsi_p,feigconv_p)
!
    call tstatc0_end(id_sname)
!
  contains
    subroutine get_ipri0(ipri_in, ipri_out)
      integer, intent(in)  :: ipri_in
      integer, intent(out) :: ipri_out
      if(npes > 1) then
         if(mype == 0) ipri_out = ipri_in
         call mpi_bcast(ipri_out,1,mpi_integer,0,mpi_k_world(myrank_k),ierr)
      else
         ipri_out = ipri_in
      end if
    end subroutine get_ipri0
!
#ifdef NEC_TUNE
    subroutine wd_w1hw2(somecomment,iblock)
      character(len=*), intent(in) :: somecomment
      integer,intent(in) :: iblock
      integer :: ib1, ib2, nel_wd, nsb_wd
      write(nfout,'(a35)') somecomment
      write(nfout,*) 'w1hw2 for ik = ',ik
      nel_wd = 8
      nsb_wd = 8
      if(nel_wd > nel) nel_wd = nel
      if(nsb_wd > nsize_sb_now) nsb_wd = nsize_sb_now
      if(kimg_t==1) then
         do ib2=1,nsb_wd
            write(nfout,9001) (w1hw2_(ib2,ip0),ip0=1,ib2)
         end do
      else
         do ib2=1,nsb_wd
            write(nfout,9001) (w1hw2_(2*ib2-1,ip0),ip0=1,ib2)
         end do
         do ib2=1,nsb_wd
            write(nfout,9001) (w1hw2_(2*ib2  ,ip0),ip0=1,ib2)
         end do
      end if
      write(nfout,*) 'w1sw2 for ik = ',ik
      if(kimg_t==1) then
         do ib2=1,nsb_wd
            write(nfout,9001) (w1sw2_(ib2,ip0),ip0=1,ib2)
         end do
      else
         do ib2=1,nsb_wd
            write(nfout,9001) (w1sw2_(2*ib2-1,ip0),ip0=1,ib2)
         end do
         do ib2=1,nsb_wd
            write(nfout,9001) (w1sw2_(2*ib2  ,ip0),ip0=1,ib2)
         end do
      end if
9001  format(5x,9f12.5)
      write(nfout,*) 'eko_l for ik = ',ik
      write(nfout,9001) (eko_l(nsta+ib1-1,ik),ib1=1,nel_wd)
    end subroutine wd_w1hw2
#else
    subroutine wd_w1hw2(somecomment,iblock)
      character(len=*), intent(in) :: somecomment
      integer,intent(in) :: iblock
      integer :: ib1, ib2, nel_wd, nsb_wd
      write(nfout,'(a35)') somecomment
      write(nfout,*) 'w1hw2 for ik = ',ik
      nel_wd = 8
      nsb_wd = 8
      if(nel_wd > nel) nel_wd = nel
      if(nsb_wd > nsize_sb_now) nsb_wd = nsize_sb_now
      if(kimg_t==1) then
         do ib2=1,nsb_wd
            ip0b = ib2*(ib2-1)/2
            write(nfout,9001) (w1hw2(ip0),ip0=ip0b+1,ip0b+ib2)
         end do
      else
         do ib2=1,nsb_wd
            ip0b = ib2*(ib2-1)/2
            write(nfout,9001) (w1hw2(2*ip0-1),ip0=ip0b+1,ip0b+ib2)
         end do
         do ib2=1,nsb_wd
            ip0b = ib2*(ib2-1)/2
            write(nfout,9001) (w1hw2(2*ip0),ip0=ip0b+1,ip0b+ib2)
         end do
      end if
      write(nfout,*) 'w1sw2 for ik = ',ik
      if(kimg_t==1) then
         do ib2=1,nsb_wd
            ip0b = ib2*(ib2-1)/2
            write(nfout,9001) (w1sw2(ip0),ip0=ip0b+1,ip0b+ib2)
         end do
      else
         do ib2=1,nsb_wd
            ip0b = ib2*(ib2-1)/2
            write(nfout,9001) (w1sw2(2*ip0-1),ip0=ip0b+1,ip0b+ib2)
         end do
         do ib2=1,nsb_wd
            ip0b = ib2*(ib2-1)/2
            write(nfout,9001) (w1sw2(2*ip0),ip0=ip0b+1,ip0b+ib2)
         end do
      end if
9001  format(5x,9f12.5)
      write(nfout,*) 'eko_l for ik = ',ik
      write(nfout,9001) (eko_l(nsta+ib1-1,ik),ib1=1,nel_wd)
    end subroutine wd_w1hw2
#endif

    subroutine add_nonlocal_part
      integer :: ip,ib1,ib2,ibb1,ibb2
      integer       :: ia, lmt1, lmt2, it, p, s, ib
      real(kind=DP) :: facv,facq,vr,vi,qr,qi
      real(kind=DP) :: tmpr,tmpi
#ifdef NEC_TUNE
      real(kind=DP), dimension(nlmta,nel,itot+1) :: fsr_t, fsr_v, fsr_q
      real(kind=DP), dimension(nlmta,nel,itot+1) :: fsi_t, fsi_v, fsi_q
      real(kind=DP), allocatable, dimension(:,:) :: fsr_tt, fsr_vt, fsr_qt
      real(kind=DP), allocatable, dimension(:,:) :: fsi_tt, fsi_vt, fsi_qt
      integer :: iter, ncount, i,itot0
      itot0 = itot
      if(sw_MRCV_only==ON) itot0 = 1
      if(kimg == 1) then
         fsr_v = 0.0d0
         fsr_q = 0.0d0
         fsi_v = 0.0d0
         fsi_q = 0.0d0
         do iter = 1, itot0+1
            ncount = 0
            do ia = ista_atm, iend_atm
               it = ityp(ia)
               do lmt1 = 1, ilmt(it)
                  ncount = ncount + 1
                  p = lmta(lmt1,ia)
                  do ibb1 = 1, nel
                     ibb2 = nsta+ibb1-1
                     fsr_t(ncount,ibb1,iter) = fsr(ibb2,p,iter)
                     fsi_t(ncount,ibb1,iter) = fsi(ibb2,p,iter)
                  end do ! ibb1
                  do lmt2 = 1, ilmt(it)
                     s = lmta(lmt2,ia)
                     if(ipaw(it) == 0)then
                        facv   = iwei(ia)*(dion(lmt1,lmt2,it) + vlhxcQ(lmt1,lmt2,ia,ispin))
                     else
                        facv   = iwei(ia)*(dion_paw(lmt1,lmt2,ispin,ia) + vlhxcQ(lmt1,lmt2,ia,ispin))
                     endif
                     facq   = iwei(ia)*q(lmt1,lmt2,it)
                     do ibb1 = 1, nel
                        ibb2 = nsta+ibb1-1
                        fsr_v(ncount,ibb1,iter) = fsr_v(ncount,ibb1,iter) + facv*fsr(ibb2,s,iter)
                        fsr_q(ncount,ibb1,iter) = fsr_q(ncount,ibb1,iter) + facq*fsr(ibb2,s,iter)
                        fsi_v(ncount,ibb1,iter) = fsi_v(ncount,ibb1,iter) + facv*fsi(ibb2,s,iter)
                        fsi_q(ncount,ibb1,iter) = fsi_q(ncount,ibb1,iter) + facq*fsi(ibb2,s,iter)
                     end do ! ibb1
                  end do ! lmt2
               end do ! lmt1
            end do ! ia
         end do ! iter
         allocate(fsr_tt(ncount,nsize_sb_now))
         allocate(fsr_vt(ncount,nsize_sb_now))
         allocate(fsr_qt(ncount,nsize_sb_now))
         allocate(fsi_tt(ncount,nsize_sb_now))
         allocate(fsi_vt(ncount,nsize_sb_now))
         allocate(fsi_qt(ncount,nsize_sb_now))
         num = 0
         do ibb1 = 1, nsize_max_sb_now
            if(ibover(ibb1,iblock)<0) cycle
            iter1=(ibb1-1)/nel+1
            ii1  = ibb1-nel*(iter1-1)
            num = num + 1
            fsr_tt(1:ncount,num) = fsr_t(1:ncount,ii1,iter1)
            fsr_vt(1:ncount,num) = fsr_v(1:ncount,ii1,iter1)
            fsr_qt(1:ncount,num) = fsr_q(1:ncount,ii1,iter1)
            fsi_tt(1:ncount,num) = fsi_t(1:ncount,ii1,iter1)
            fsi_vt(1:ncount,num) = fsi_v(1:ncount,ii1,iter1)
            fsi_qt(1:ncount,num) = fsi_q(1:ncount,ii1,iter1)
         end do
         ldc = nsize_sb_now
         if(ncount>0)then
            call dgemm('T','N',num,num,ncount,1.0d0,fsr_tt,ncount,fsr_vt,ncount,1.0d0,w1hw2_,ldc)
            call dgemm('T','N',num,num,ncount,1.0d0,fsi_tt,ncount,fsi_vt,ncount,1.0d0,w1hw2_,ldc)
            call dgemm('T','N',num,num,ncount,1.0d0,fsr_tt,ncount,fsr_qt,ncount,1.0d0,w1sw2_,ldc)
            call dgemm('T','N',num,num,ncount,1.0d0,fsi_tt,ncount,fsi_qt,ncount,1.0d0,w1sw2_,ldc)
         endif
         deallocate(fsr_tt)
         deallocate(fsr_vt)
         deallocate(fsr_qt)
         deallocate(fsi_tt)
         deallocate(fsi_vt)
         deallocate(fsi_qt)
      else ! if(kimg == 1)
         if(k_symmetry(ik) == GAMMA) then
            fsr_v = 0.0d0
            fsr_q = 0.0d0
            do iter = 1, itot0+1
               ncount = 0
               do ia = ista_atm, iend_atm
                  it = ityp(ia)
                  do lmt1 = 1, ilmt(it)
                     ncount = ncount + 1
                     p = lmta(lmt1,ia)
                     do ibb1 = 1, nel
                        ibb2 = nsta+ibb1-1
                        fsr_t(ncount,ibb1,iter) = fsr(ibb2,p,iter)
                     end do ! ibb1
                     do lmt2 = 1, ilmt(it)
                        s = lmta(lmt2,ia)
                        if(ipaw(it) == 0)then
                           facv   = iwei(ia)*(dion(lmt1,lmt2,it) + vlhxcQ(lmt1,lmt2,ia,ispin))
                        else
                           facv   = iwei(ia)*(dion_paw(lmt1,lmt2,ispin,ia) + vlhxcQ(lmt1,lmt2,ia,ispin))
                        endif
                        facq   = iwei(ia)*q(lmt1,lmt2,it)
                        do ibb1 = 1, nel
                           ibb2 = nsta+ibb1-1
                           fsr_v(ncount,ibb1,iter) = fsr_v(ncount,ibb1,iter) + facv*fsr(ibb2,s,iter)
                           fsr_q(ncount,ibb1,iter) = fsr_q(ncount,ibb1,iter) + facq*fsr(ibb2,s,iter)
                        end do ! ibb1
                     end do ! lmt2
                  end do ! lmt1
               end do ! ia
            end do ! iter
            allocate(fsr_tt(ncount,nsize_sb_now))
            allocate(fsr_vt(ncount,nsize_sb_now))
            allocate(fsr_qt(ncount,nsize_sb_now))
            num = 0
            do ibb1 = 1, nsize_max_sb_now
               if(ibover(ibb1,iblock)<0) cycle
               iter1=(ibb1-1)/nel+1
               ii1  = ibb1-nel*(iter1-1)
               num = num + 1
               fsr_tt(1:ncount,num) = fsr_t(1:ncount,ii1,iter1)
               fsr_vt(1:ncount,num) = fsr_v(1:ncount,ii1,iter1)
               fsr_qt(1:ncount,num) = fsr_q(1:ncount,ii1,iter1)
            end do
            ldc = nsize_sb_now
            if(ncount>0) then
               call dgemm('T','N',num,num,ncount,1.0d0,fsr_tt,ncount,fsr_vt,ncount,1.0d0,w1hw2_,ldc)
               call dgemm('T','N',num,num,ncount,1.0d0,fsr_tt,ncount,fsr_qt,ncount,1.0d0,w1sw2_,ldc)
            end if
            deallocate(fsr_tt)
            deallocate(fsr_vt)
            deallocate(fsr_qt)
         else ! if(k_symmetry(ik) == GAMMA)
            fsr_v = 0.0d0
            fsr_q = 0.0d0
            fsi_v = 0.0d0
            fsi_q = 0.0d0
            do iter = 1, itot0+1
               ncount = 0
               do ia = ista_atm, iend_atm
                  it = ityp(ia)
                  do lmt1 = 1, ilmt(it)
                     ncount = ncount + 1
                     p = lmta(lmt1,ia)
                     do ibb1 = 1, nel
                        ibb2 = nsta+ibb1-1
                        fsr_t(ncount,ibb1,iter) = fsr(ibb2,p,iter)
                        fsi_t(ncount,ibb1,iter) = fsi(ibb2,p,iter)
                     end do ! ibb1
                     do lmt2 = 1, ilmt(it)
                        s = lmta(lmt2,ia)
                        if(ipaw(it) == 0)then
                           facv   = iwei(ia)*(dion(lmt1,lmt2,it) + vlhxcQ(lmt1,lmt2,ia,ispin))
                        else
                           facv   = iwei(ia)*(dion_paw(lmt1,lmt2,ispin,ia) + vlhxcQ(lmt1,lmt2,ia,ispin))
                        endif
                        facq   = iwei(ia)*q(lmt1,lmt2,it)
                        do ibb1 = 1, nel
                           ibb2 = nsta+ibb1-1
                           fsr_v(ncount,ibb1,iter) = fsr_v(ncount,ibb1,iter) + facv*fsr(ibb2,s,iter)
                           fsr_q(ncount,ibb1,iter) = fsr_q(ncount,ibb1,iter) + facq*fsr(ibb2,s,iter)
                           fsi_v(ncount,ibb1,iter) = fsi_v(ncount,ibb1,iter) + facv*fsi(ibb2,s,iter)
                           fsi_q(ncount,ibb1,iter) = fsi_q(ncount,ibb1,iter) + facq*fsi(ibb2,s,iter)
                        end do ! ibb1
                     end do ! lmt2
                  end do ! lmt1
               end do ! ia
            end do ! iter
            allocate(fsr_tt(ncount*kimg,nsize_sb_now))
            allocate(fsr_vt(ncount*kimg,nsize_sb_now))
            allocate(fsr_qt(ncount*kimg,nsize_sb_now))
            num = 0
            do ibb1 = 1, nsize_max_sb_now
               if(ibover(ibb1,iblock)<0) cycle
               iter1=(ibb1-1)/nel+1
               ii1  = ibb1-nel*(iter1-1)
               num = num + 1
               do i = 1, ncount
                  fsr_tt(i*2-1,num) = fsr_t(i,ii1,iter1)
                  fsr_tt(i*2,  num) = fsi_t(i,ii1,iter1)
                  fsr_vt(i*2-1,num) = fsr_v(i,ii1,iter1)
                  fsr_vt(i*2,  num) = fsi_v(i,ii1,iter1)
                  fsr_qt(i*2-1,num) = fsr_q(i,ii1,iter1)
                  fsr_qt(i*2,  num) = fsi_q(i,ii1,iter1)
               end do
            end do
            ldc = nsize_sb_now
            if(ncount>0)then
               call zgemm('C','N',num,num,ncount,c1,fsr_tt,ncount,fsr_vt,ncount,c1,w1hw2_,ldc)
               call zgemm('C','N',num,num,ncount,c1,fsr_tt,ncount,fsr_qt,ncount,c1,w1sw2_,ldc)
            endif
            deallocate(fsr_tt)
            deallocate(fsr_vt)
            deallocate(fsr_qt)
         end if ! if(k_symmetry(ik) == GAMMA)
      end if ! if(kimg == 1)
#else
      do ibb2 = 1,nsize_max_sb_now
         if(ibover(ibb2,iblock)<0) cycle
         ib2 = ibover(ibb2,iblock)
         iter2= (ibb2-1)/nel+1
         ii2  = ibb2-nel*(iter2-1)
         ii2  = nsta+ii2-1
         ip0b = ib2*(ib2-1)/2
         do ibb1 = 1,ibb2
            if(ibover(ibb1,iblock)<0) cycle
            ib1 = ibover(ibb1,iblock)
            iter1=(ibb1-1)/nel+1
            ii1  = ibb1-nel*(iter1-1)
            ii1  = nsta+ii1-1
            ip0 = ip0b + ib1
!            if(mod(ip0-1,nrank_e)/=myrank_e) cycle
            if(kimg_t==1) then
               vr=0.d0
               qr=0.d0
            else
               vr=0.d0
               vi=0.d0
               qr=0.d0
               qi=0.d0
            end if
            do ia = 1, natm
               it = ityp(ia)
               do lmt1 = 1, ilmt(it)
                  p = lmta(lmt1,ia)
                  do lmt2 = 1, ilmt(it)
                     s = lmta(lmt2,ia)
                     if(ipaw(it)==0)then
                        facv   = iwei(ia)*(dion(lmt1,lmt2,it) + vlhxcQ(lmt1,lmt2,ia,ispin))
                     else
                        facv   = iwei(ia)*(dion_paw(lmt1,lmt2,ispin,ia) + vlhxcQ(lmt1,lmt2,ia,ispin))
                     endif
                     facq   = iwei(ia)*q(lmt1,lmt2,it)
                     if(kimg==1) then
                        tmpr = fsr(ii1,p,iter1)*fsr(ii2,s,iter2)&
                    &        + fsi(ii1,p,iter1)*fsi(ii2,s,iter2)
                        vr = vr + facv*tmpr
                        qr = qr + facq*tmpr
                     else
                        if(k_symmetry(ik) == GAMMA) then
                           tmpr = fsr(ii1,p,iter1)*fsr(ii2,s,iter2)
                           vr = vr + facv*tmpr
                           qr = qr + facq*tmpr
                        else
                           tmpr = fsr(ii1,p,iter1)*fsr(ii2,s,iter2)&
                    &        + fsi(ii1,p,iter1)*fsi(ii2,s,iter2)
                           tmpi = fsr(ii1,p,iter1)*fsi(ii2,s,iter2)&
                    &        - fsi(ii1,p,iter1)*fsr(ii2,s,iter2)
                           vr = vr + facv*tmpr
                           vi = vi + facv*tmpi
                           qr = qr + facq*tmpr
                           qi = qi + facq*tmpi
                        end if
                     end if
                  end do
               end do
            end do
            if(kimg_t==1) then
               w1hw2(ip0) = w1hw2(ip0) + vr
               w1sw2(ip0) = w1sw2(ip0) + qr
            else
               w1hw2(2*ip0-1) = w1hw2(2*ip0-1) + vr
               w1hw2(2*ip0  ) = w1hw2(2*ip0  ) + vi
               w1sw2(2*ip0-1) = w1sw2(2*ip0-1) + qr
               w1sw2(2*ip0  ) = w1sw2(2*ip0  ) + qi
            end if
         end do
      end do
#endif
    end subroutine add_nonlocal_part


    subroutine subspace_rotation
      integer :: ib1,ib2,ibb2,iadd,is,is1
!!$      real(kind=DP), dimension(np_g1k_x,neg,kimg) :: zaj_wk
      real(kind=DP), allocatable, dimension(:,:,:) :: zaj_wk
      real(kind=DP), allocatable, dimension(:,:,:) :: zah_wk
      real(kind=DP), allocatable, dimension(:,:)   :: fsr_wk
      real(kind=DP), allocatable, dimension(:,:)   :: fsi_wk
#ifdef NEC_TUNE
      integer :: i
      real(kind=DP), allocatable, dimension(:,:)   :: fsr_t, fsi_t

      if(kimg==1) then
         allocate(zaj_wk(maxval(np_g1k),nel,kimg))
         if(itot /= max_iter_david) allocate(zah_wk(maxval(np_g1k),nel,kimg))
         allocate(fsr_wk(nel,np_fs),fsi_wk(nel,np_fs))
         allocate(fsr_t(nsize_sb_now,np_fs),fsi_t(nsize_sb_now,np_fs))
         num = 0
         do ibb2 = 1, nsize_max_sb_now
            if(ibover(ibb2,iblock) < 0) cycle
            iter2=(ibb2-1)/nel+1
            ii2=ibb2-nel*(iter2-1)
            ii2=nsta+ii2-1
            num = num + 1
            do ii = ista_fs, iend_fs
               i = ii - ista_fs + 1
               fsr_t(num,i) = fsr(ii2,ii,iter2)
               fsi_t(num,i) = fsi(ii2,ii,iter2)
            end do
         end do
         call dgemm('N','N',np_g1k(ik),nel,nsize_sb_now,1.0d0,zat_t,maxval(np_g1k),vec,nsize_sb_now,0.0d0,zaj_wk,maxval(np_g1k))
         call dgemm('T','N',nel,np_fs,nsize_sb_now,1.0d0,vec,nsize_sb_now,fsr_t,nsize_sb_now,0.0d0,fsr_wk,nel)
         call dgemm('T','N',nel,np_fs,nsize_sb_now,1.0d0,vec,nsize_sb_now,fsi_t,nsize_sb_now,0.0d0,fsi_wk,nel)
         if(itot /= max_iter_david) then
            call dgemm('N','N',np_g1k(ik),nel,nsize_sb_now,1.0d0,zah_t,maxval(np_g1k),vec,nsize_sb_now,0.0d0,zah_wk,maxval(np_g1k))
         end if
         zaj_l(:,nsta:nend,ik,:) = zaj_wk(:,:,:)
#ifdef SAVE_FFT_TIMES
         if(sw_save_fft == ON) status_saved_phifftr(nsta:nend,ik) = OLD
#endif
         if(itot /= max_iter_david) zah_l(:,nsta:nend,:,1) = zah_wk(:,:,:)
         do is = 1, np_fs ! MPI
            do ib = nsta, nend ! MPI
               fsr_l(ib,is,ik) = fsr_wk(ib-nsta+1,is)
               fsi_l(ib,is,ik) = fsi_wk(ib-nsta+1,is)
            end do
         end do
         deallocate(fsr_t,fsi_t)
         deallocate(zaj_wk,fsr_wk,fsi_wk)
         if(itot /= max_iter_david) deallocate(zah_wk)
      else
         if(k_symmetry(ik) == GAMMA) then
            allocate(zaj_wk(maxval(np_g1k),nel,kimg))
            if(itot /= max_iter_david) allocate(zah_wk(maxval(np_g1k),nel,kimg))
            allocate(fsr_wk(nel,np_fs))
            allocate(fsr_t(nsize_sb_now,np_fs))
            num = 0
            do ibb2 = 1, nsize_max_sb_now
               if(ibover(ibb2,iblock) < 0) cycle
               iter2=(ibb2-1)/nel+1
               ii2=ibb2-nel*(iter2-1)
               ii2=nsta+ii2-1
               num = num + 1
               do ii = ista_fs, iend_fs
                  i = ii - ista_fs + 1
                  fsr_t(num,i) = fsr(ii2,ii,iter2)
               end do
            end do
            call dgemm('N','N',np_g1k(ik),nel,nsize_sb_now,1.0d0,zat_t(1,1,1),   maxval(np_g1k), &
            & vec,nsize_sb_now,0.0d0,zaj_wk(1,1,1),   maxval(np_g1k))
            call dgemm('N','N',np_g1k(ik),nel,nsize_sb_now,1.0d0,zat_t(1,1,kimg),maxval(np_g1k), &
            & vec,nsize_sb_now,0.0d0,zaj_wk(1,1,kimg),maxval(np_g1k))
            call dgemm('T','N',nel,np_fs,nsize_sb_now,1.0d0,vec,nsize_sb_now,fsr_t,nsize_sb_now,0.0d0,fsr_wk,nel)
            if(itot /= max_iter_david) then
               call dgemm('N','N',np_g1k(ik),nel,nsize_sb_now,1.0d0,zah_t(1,1,1),   maxval(np_g1k), &
            & vec,nsize_sb_now,0.0d0,zah_wk(1,1,1),   maxval(np_g1k))
               call dgemm('N','N',np_g1k(ik),nel,nsize_sb_now,1.0d0,zah_t(1,1,kimg),maxval(np_g1k), &
            & vec,nsize_sb_now,0.0d0,zah_wk(1,1,kimg),maxval(np_g1k))
            end if
            zaj_l(:,nsta:nend,ik,:) = zaj_wk(:,:,:)
#ifdef SAVE_FFT_TIMES
            if(sw_save_fft == ON) status_saved_phifftr(nsta:nend,ik) = OLD
#endif
            if(itot /= max_iter_david) zah_l(:,nsta:nend,:,1) = zah_wk(:,:,:)
            do is = 1, np_fs ! MPI
               do ib = nsta, nend ! MPI
                  fsr_l(ib,is,ik) = fsr_wk(ib-nsta+1,is)
               end do
            end do
            deallocate(fsr_t)
            deallocate(zaj_wk,fsr_wk)
            if(itot /= max_iter_david) deallocate(zah_wk)
         else
            allocate(zaj_wk(maxval(np_g1k)*kimg,nel,1))
            if(itot /= max_iter_david) allocate(zah_wk(maxval(np_g1k)*kimg,nel,1))
            allocate(fsr_wk(nel*kimg,np_fs))
            allocate(fsr_t(nsize_sb_now*kimg,np_fs))
            num = 0
            do ibb2 = 1, nsize_max_sb_now
               if(ibover(ibb2,iblock) < 0) cycle
               iter2=(ibb2-1)/nel+1
               ii2=ibb2-nel*(iter2-1)
               ii2=nsta+ii2-1
               num = num + 1
               do ii = ista_fs, iend_fs
                  i = ii - ista_fs + 1
                  fsr_t(2*num-1,i) = fsr(ii2,ii,iter2)
                  fsr_t(2*num,  i) = fsi(ii2,ii,iter2)
               end do
            end do
            call zgemm('N','N',np_g1k(ik),nel,nsize_sb_now,c1,zat_t,maxval(np_g1k),vec,nsize_sb_now,c0,zaj_wk,maxval(np_g1k))
            call zgemm('T','N',nel,np_fs,nsize_sb_now,c1,vec,nsize_sb_now,fsr_t,nsize_sb_now,c0,fsr_wk,nel)
            if(itot /= max_iter_david) then
               call zgemm('N','N',np_g1k(ik),nel,nsize_sb_now,c1,zah_t,maxval(np_g1k),vec,nsize_sb_now,c0,zah_wk,maxval(np_g1k))
            end if
            do ib = nsta, nend
               i = ib - nsta + 1
               do ii = 1, np_g1k(ik)
                  zaj_l(ii,ib,ik,1)    = zaj_wk(2*ii-1,i,1)
                  zaj_l(ii,ib,ik,kimg) = zaj_wk(2*ii,  i,1)
               end do
            end do
#ifdef SAVE_FFT_TIMES
            if(sw_save_fft == ON) status_saved_phifftr(nsta:nend,ik) = OLD
#endif
            if(itot /= max_iter_david) then
               do ib = nsta, nend
                  i = ib - nsta + 1
                  do ii = 1, np_g1k(ik)
                     zah_l(ii,ib,1,   1) = zah_wk(2*ii-1,i,1)
                     zah_l(ii,ib,kimg,1) = zah_wk(2*ii,  i,1)
                  end do
               end do
            end if
            do is = 1, np_fs ! MPI
               do ib = nsta, nend ! MPI
                  i = ib - nsta + 1
                  fsr_l(ib,is,ik) = fsr_wk(2*i-1,is)
                  fsi_l(ib,is,ik) = fsr_wk(2*i,  is)
               end do
            end do
            deallocate(fsr_t)
            deallocate(zaj_wk,fsr_wk)
            if(itot /= max_iter_david) deallocate(zah_wk)
         end if
      end if
#else
      allocate(zaj_wk(maxval(np_g1k),nel,kimg))
      if(itot /= max_iter_david) then
        allocate(zah_wk(maxval(np_g1k),nel,kimg))
        zah_wk(:,:,:) = 0.d0
      end if
      allocate(fsr_wk(nel,nlmta))
      if(k_symmetry(ik) /= GAMMA) then
        allocate(fsi_wk(nel,nlmta))
        fsi_wk(:,:)=0.d0
      end if

      zaj_wk(:,:,:) = 0.d0
      fsr_wk(:,:)=0.d0
      if(kimg==1) then
         do ib1=1,nel
            do ibb2=1,nsize_max_sb_now
               if(ibover(ibb2,iblock)<0) cycle
               ib2 = ibover(ibb2,iblock)
               iter2=(ibb2-1)/nel+1
               ii2=ibb2-nel*(iter2-1)
               ii2=nsta+ii2-1
               hr2=vec(ib2,ib1)
               do ii=ista_g1k(ik),iend_g1k(ik)
                  iadd = ii - ista_g1k(ik) + 1
                  zaj_wk(iadd,ib1,kimg) = zaj_wk(iadd,ib1,kimg) + zat_l(iadd,ii2,kimg,iter2)*hr2
               end do
               do ii=1,nlmta
                 fsr_wk(ib1,ii) = fsr_wk(ib1,ii) + fsr(ii2,ii,iter2)*hr2
                 fsi_wk(ib1,ii) = fsi_wk(ib1,ii) + fsi(ii2,ii,iter2)*hr2
               end do
               if(itot /= max_iter_david) then
                 do ii=ista_g1k(ik),iend_g1k(ik)
                    iadd = ii - ista_g1k(ik) + 1
                    zah_wk(iadd,ib1,kimg) = zah_wk(iadd,ib1,kimg) + zah_l(iadd,ii2,kimg,iter2)*hr2
                 end do
               end if
            end do
         end do
      else
         if(k_symmetry(ik) == GAMMA) then
            do ib1=1,nel
               do ibb2=1,nsize_max_sb_now
                  if(ibover(ibb2,iblock)<0) cycle
                  ib2 = ibover(ibb2,iblock)
                  iter2=(ibb2-1)/nel+1
                  ii2=ibb2-nel*(iter2-1)
                  ii2=nsta+ii2-1
                  hr2=vec(ib2,ib1)
                  do ii=ista_g1k(ik),iend_g1k(ik)
                     iadd = ii - ista_g1k(ik) + 1
                     dr1=zat_l(iadd,ii2,1   ,iter2)
                     di1=zat_l(iadd,ii2,kimg,iter2)
                     zaj_wk(iadd,ib1,1   ) = zaj_wk(iadd,ib1,1   ) + dr1*hr2
                     zaj_wk(iadd,ib1,kimg) = zaj_wk(iadd,ib1,kimg) + di1*hr2
                  end do
                  do ii=1,nlmta
                    fsr_wk(ib1,ii) = fsr_wk(ib1,ii) + fsr(ii2,ii,iter2)*hr2
                  end do
                  if(itot /= max_iter_david) then
                    do ii=ista_g1k(ik),iend_g1k(ik)
                       iadd = ii - ista_g1k(ik) + 1
                       dr1=zah_l(iadd,ii2,1   ,iter2)
                       di1=zah_l(iadd,ii2,kimg,iter2)
                       zah_wk(iadd,ib1,1   ) = zah_wk(iadd,ib1,1   ) + dr1*hr2
                       zah_wk(iadd,ib1,kimg) = zah_wk(iadd,ib1,kimg) + di1*hr2
                    end do
                  end if
               end do
            end do
         else
            do ib1=1,nel
               do ibb2=1,nsize_max_sb_now
                  if(ibover(ibb2,iblock)<0) cycle
                  ib2 = ibover(ibb2,iblock)
                  iter2=(ibb2-1)/nel+1
                  ii2=ibb2-nel*(iter2-1)
                  ii2=nsta+ii2-1
                  hr2=vec(2*ib2-1,ib1)
                  hi2=vec(2*ib2  ,ib1)
                  do ii=ista_g1k(ik),iend_g1k(ik)
                     iadd = ii - ista_g1k(ik) + 1
                     dr1=zat_l(iadd,ii2,1   ,iter2)
                     di1=zat_l(iadd,ii2,kimg,iter2)
                     zaj_wk(iadd,ib1,1   ) = zaj_wk(iadd,ib1,1   ) + dr1*hr2 - di1*hi2
                     zaj_wk(iadd,ib1,kimg) = zaj_wk(iadd,ib1,kimg) + dr1*hi2 + di1*hr2
                  end do
                  do ii=1,nlmta
                     dr1=fsr(ii2,ii,iter2)
                     di1=fsi(ii2,ii,iter2)
                     fsr_wk(ib1,ii) = fsr_wk(ib1,ii) + dr1*hr2 - di1*hi2
                     fsi_wk(ib1,ii) = fsi_wk(ib1,ii) + dr1*hi2 + di1*hr2
                  end do
                  if(itot /= max_iter_david) then
                    do ii=ista_g1k(ik),iend_g1k(ik)
                       iadd = ii - ista_g1k(ik) + 1
                       dr1=zah_l(iadd,ii2,1   ,iter2)
                       di1=zah_l(iadd,ii2,kimg,iter2)
                       zah_wk(iadd,ib1,1   ) = zah_wk(iadd,ib1,1   ) + dr1*hr2 - di1*hi2
                       zah_wk(iadd,ib1,kimg) = zah_wk(iadd,ib1,kimg) + dr1*hi2 + di1*hr2
                    end do
                  end if
               end do
            end do
         end if
      end if
!print *,itot,  itot /= max_iter_david
      zaj_l(:,nsta:nend,ik,:) = zaj_wk(:,:,:)
#ifdef SAVE_FFT_TIMES
      if(sw_save_fft == ON) status_saved_phifftr(nsta:nend,ik) = OLD
#endif
      if(itot /= max_iter_david) zah_l(:,nsta:nend,:,1) = zah_wk(:,:,:)
!     fsr_l(nsta:nend,:,ik) = fsr_wk(:,:)
      do is = 1, np_fs ! MPI
         is1=nis_fs(myrank_g)+is-1
         do ib = nsta, nend ! MPI
             fsr_l(ib,is,ik) = fsr_wk(ib-nsta+1,is1)
         end do
      end do
!     if(k_symmetry(ik) /= GAMMA) fsi_l(nsta:nend,:,ik) = fsi_wk(:,:)
      if(k_symmetry(ik) /= GAMMA) then
         do is = 1, np_fs ! MPI
            is1=nis_fs(myrank_g)+is-1
            do ib = nsta, nend ! MPI
               fsi_l(ib,is,ik) = fsi_wk(ib-nsta+1,is1)
            end do
         end do
      end if

      deallocate(zaj_wk,fsr_wk)
      if(itot /= max_iter_david) deallocate(zah_wk)
      if(k_symmetry(ik) /= GAMMA) deallocate(fsi_wk)
#endif
    end subroutine subspace_rotation

  end subroutine evolve_WFs_in_subspace_3D

  subroutine decide_correction_vector_3D(precon,ik,ispin,ekin_l, &
                                         ibsize,p_l,itot, &
                                         lsize, bfft_l, wk_bfft_l ,afft_l )
    integer, intent(in)       :: precon, ik, ispin, ibsize
    real(kind=DP), intent(in)  :: ekin_l(maxval(np_g1k))
    integer, intent(in) :: lsize
#ifdef FFT_3D_DIVISION
    real(kind=DP), intent(in), optional  :: afft_l(lsize*2   )
    real(kind=DP), intent(out), optional :: bfft_l(lsize*2   ,1)
    real(kind=DP), intent(inout), optional :: wk_bfft_l(lsize*2   ,ibsize)
#else
    real(kind=DP), intent(in), optional  :: afft_l(lsize*kimg)
!!!$    real(kind=DP), intent(out) :: bfft_l(lsize*kimg,1)
    real(kind=DP), intent(out), optional :: bfft_l(lsize*kimg,ibsize)
    real(kind=DP), intent(inout), optional :: wk_bfft_l(lsize*kimg,ibsize)
#endif
    integer,       intent(in)  :: itot
    real(kind=DP)              :: p_l(maxval(np_g1k))
    integer :: ib,ib1,ib2,ibesize, iflag
    integer :: nsize
    integer :: isrsize, fft_l_size
#ifdef MPI_FFTW
    integer(C_INTPTR_T)  :: local_n, local_n_offset, alloc_local, lx, ly, lz, mx,my,mz
#endif
    real(kind=DP), allocatable, dimension(:,:) :: vexx
    real(kind=DP), allocatable, dimension(:,:) :: vtau_phl

    integer :: ng
    integer :: id_sname = -1, id_sname1 = -1, id_sname2 = -1, id_sname3 = -1, id_sname4 = -1, id_sname5 = -1
    call tstatc0_begin('decide_correction_vector_3D(0) ', id_sname,1)

    isrsize = min(lsize,mp_g1k(ik))
#ifdef MPI_FFTW
    if(sw_mpi_fftw==ON) then
      fft_l_size  = lsize
    else
      fft_l_size  = nel_fft_x(myrank_g)
    endif
#else
    fft_l_size  = nel_fft_x(myrank_g)
#endif
    if(sw_hybrid_functional==ON) then
        ng = maxval(np_g1k)
        allocate(vexx(ng,kimg))
        vexx = 0.0d0
    else
        ng = 1
        allocate(vexx(ng,kimg))
        vexx = 0.0d0
     endif
    if ( use_metagga .and. vtau_exists ) then
       allocate( vtau_phl(maxval(np_g1k),kimg) );  vtau_phl = 0.0d0
    endif

#ifdef MPI_FFTW
!    if(itot==1 .and. sw_keep_hloc_phi==OFF .and. sw_mpi_fftw==ON) then
    if (sw_mpi_fftw==ON) then
      lx = fft_box_size_WF(1,0)
      ly = fft_box_size_WF(2,0)
      lz = fft_box_size_WF(3,0)
      if(kimg==2) then
        alloc_local = fftw_mpi_local_size_3d(ly,lz,lx,mpi_ke_world,local_n,local_n_offset)
      else
        alloc_local = fftw_mpi_local_size_3d(ly,lz,lx/2,mpi_ke_world,local_n,local_n_offset)
      endif
    endif
#endif

!    iflag = 0
!    if ( use_metagga .and. vtau_exists ) iflag = 1
    do ib1 = 1, np_e, ibsize
       if(sw_hybrid_functional==ON) then
          if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
             call m_ES_EXX_potential(nfout,ispin,ib1,ik,ng,fsr_l(ib1,:,ik),fsi_l(ib1,:,ik),vexx,0)
          else
             call m_ES_EXX_potential(nfout,ispin,ib1,ik,ng,fsr_l(ib1,:,ik),fsr_l(ib1,:,ik),vexx,0)
          endif
       endif

       if ( use_metagga .and. vtau_exists ) then
#ifdef MPI_FFTW
          if ( sw_mpi_fftw == ON ) then
             call m_ES_kindens_to_vnlph_ib2_mpfw( ispin, ik, ib1, lsize, &
                  &             lx, ly, lz, local_n, cfft_mpifftw_vlocal, vtau_phl )
          else
             call m_ES_kindens_to_vnlph_ib2( ispin, ik, ib1, lsize, cfft_l, vtau_phl )
          endif
#else
          call m_ES_kindens_to_vnlph_ib2( ispin, ik, ib1, lsize, cfft_l, vtau_phl )
#endif
       endif

!       if((itot==1 .and. sw_keep_hloc_phi==OFF) .or. iflag==1) then
       if( itot==1 .and. sw_keep_hloc_phi==OFF ) then
         ib2 = min(ib1+ibsize-1,np_e)
         ibesize = ib2 - ib1 + 1

#ifdef __TIMER_COMM__
!!$       ib = ib1
         call m_ES_WF_in_Rspace_3D(ik,ib1,ib2,ibsize,lsize,wk_bfft_l,0)
#else
#ifdef MPI_FFTW
         if(sw_mpi_fftw==ON) then
           call m_ES_WF_in_Rspace_mpifftw(ista_k,iend_k,ik,ib1,zaj_l)
         else
           call m_ES_WF_in_Rspace_3D(ik,ib1,ib2,ibsize,lsize,wk_bfft_l)
         endif
#else
         call m_ES_WF_in_Rspace_3D(ik,ib1,ib2,ibsize,lsize,wk_bfft_l)
#endif
#endif
#ifdef FFT_3D_DIVISION
         call m_FFT_Vlocal_W_3DIV_3D(afft_l,wk_bfft_l,lsize,ibsize,nel_fft_x(myrank_g))
         call m_FFT_Direct_3DIV_3D(nfout,wk_bfft_l,lsize,ibsize)
#else
#ifdef MPI_FFTW
         if(sw_mpi_fftw==ON) then
           nsize = local_n*lx*ly
           call m_FFT_Vlocal_W_mpifftw3d(afft_mpifftw_vlocal,lx,local_n,lz)
           call m_FFT_Direct_MPI_FFTW(nfout)
         else
           call m_FFT_Vlocal_W_3D(afft_l,wk_bfft_l,lsize,ibsize,nel_fft_z(myrank_g))
           call m_FFT_Direct_XYZ_3D(nfout,wk_bfft_l,lsize,ibsize)
         endif
#else
         if (sw_fft_xzy > 0) then
            call m_FFT_Vlocal_W_3D(afft_l,wk_bfft_l,lsize,ibsize,nel_fft_y(myrank_g))
            call m_FFT_Direct_3D(nfout,wk_bfft_l,lsize,ibsize)
         else
            call m_FFT_Vlocal_W_3D(afft_l,wk_bfft_l,lsize,ibsize,nel_fft_z(myrank_g))
            if(sw_serial_fft == ON) then
               call m_ES_WF_2D(ik,wk_bfft_l,ib2,ib1,ibsize,lsize,DIRECT)
            else
               call m_FFT_Direct_XYZ_3D(nfout,wk_bfft_l,lsize,ibsize)
            endif
         end if
#endif
#endif
#ifdef MPI_FFTW
         if(sw_mpi_fftw==ON) then
            call map_fft_to_WF_mpifftw(ik,lsize,ibesize,bfft_l,isrsize,fft_l_size)
         else
            call map_fft_to_WF_3D(ik,lsize,ibesize,wk_bfft_l,bfft_l,isrsize,fft_l_size)
         endif
#else
         call map_fft_to_WF_3D(ik,lsize,ibesize,wk_bfft_l,bfft_l,isrsize,fft_l_size)
#endif
         if ( use_metagga .and. vtau_exists ) then
            if(ibsize == 1) then
               call SD_direction_3D( precon, ik, ib1, ekin_l, p_l, itot, lsize, &
                    &                ng, vexx, vtau_phl=vtau_phl, VlocalW=bfft_l ) !-here
            else
               call SD_direction_3D2( precon, ik, ib1, ib2, ibesize, ekin_l, p_l, &
                    &                 itot, lsize, ng, vexx, vtau_phl=vtau_phl, &
                    &                 VlocalW=bfft_l ) !-here
            end if
         else
            if(ibsize == 1) then
               call SD_direction_3D( precon, ik, ib1, ekin_l, p_l, itot, lsize, &
                    &                ng, vexx, VlocalW=bfft_l ) !-here
            else
               call SD_direction_3D2( precon, ik, ib1, ib2, ibesize, ekin_l, p_l, &
                    &                 itot, lsize, ng, vexx, VlocalW=bfft_l ) !-here
            end if
         endif
      else
         if ( use_metagga .and. vtau_exists ) then
            if(ibsize == 1) then
               call SD_direction_3D( precon, ik, ib1, ekin_l, p_l, itot, lsize, &
                    &                ng, vexx, vtau_phl=vtau_phl ) !-here
            else
               call SD_direction_3D2( precon, ik, ib1, ib2, ibesize, ekin_l, p_l, &
                    &                 itot, lsize, ng, vexx, vtau_phl=vtau_phl ) !-here
            end if
         else
            if(ibsize == 1) then
               call SD_direction_3D(precon,ik,ib1,ekin_l,p_l,itot,lsize,ng,vexx) !-here
            else
               call SD_direction_3D2(precon,ik,ib1,ib2,ibesize,ekin_l,p_l,itot,lsize,ng,vexx) !-here
            end if
         endif
      endif
   end do
   deallocate(vexx)
   if ( allocated( vtau_phl) ) deallocate( vtau_phl )

    call mpi_barrier(mpi_k_world(myrank_k),ierr)

!    call orthogonalize_SD_drctns(ik,to=OTHER_BANDS)  ! -(m_ES_WF_by_SDorCG) ->(wfsd_l, bsd(ri)_l)
    call orthogonalize_SD_drctns(ik,to=OTHER_BANDS)
    if(ipridavidson>=2 .and. ik==1) write(nfout,'(" !### zaj_l is new,  bfft is old")')
!    call orthogonalize_SD_drctns(ik,to=ALL_BANDS)  ! -(m_ES_WF_by_SDorCG) ->(wfsd_l, bsd(ri)_l)

    call normalize_wfsd_3D(ik)

    call tstatc0_end(id_sname)
  end subroutine decide_correction_vector_3D

  subroutine orthogonalize_SD_drctns(ik,to)
    integer, intent(in) :: ik,to

    integer :: itmp
    integer :: id_sname = -1
    real(kind=DP), allocatable, dimension(:,:) :: fs_mpi
    integer :: is, is1, ib
    call tstatc0_begin('orthogonalize_SD_drctns(Mddavid) ', id_sname)

!    if(modnrm == EXECUT) call m_ES_betar_dot_Psi_4_each_k(nfout,wfsd_l,ik,ik,ik,bsdr_l,bsdi_l)
    !                                        ->bsd(ri)_l

    if(sw_MRCV_only==OFF)then
       zat_l(:,:,:,1) = zaj_l(:,:,ik,:)
!      allocate(fs_mpi(np_e,nlmta))
!!$       allocate(fs_mpi2(np_e,nlmta))
       call m_ES_gather_f_3d_to_2d_k(fsr_l(:,:,ik),  fsr(:,:,1))
       if( k_symmetry(ik) /= GAMMA ) then
         call m_ES_gather_f_3d_to_2d_k(fsi_l(:,:,ik),  fsi(:,:,1))
       endif
!      fs_mpi=0.d0
!      do is = 1, np_fs ! MPI
!         is1=nis_fs(myrank_g)+is-1
!         do ib = 1, np_e ! MPI
!            fs_mpi(ib,is1) = fsr_l(ib,is,ik)
!         end do
!      end do
!      call mpi_allreduce(MPI_IN_PLACE,fs_mpi,np_e*nlmta &
!        & ,mpi_double_precision,mpi_sum, mpi_ke_world,ierr)       ! MPI
!      fsr(1:np_e,1:nlmta,1) = fs_mpi(1:np_e,1:nlmta)
!      if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
!         fs_mpi=0.d0
!         do is = 1, np_fs ! MPI
!            is1=nis_fs(myrank_g)+is-1
!            do ib = 1, np_e ! MPI
!               fs_mpi(ib,is1) = fsi_l(ib,is,ik)
!            end do
!         end do
!         call mpi_allreduce(MPI_IN_PLACE,fs_mpi,np_e*nlmta &
!           & ,mpi_double_precision,mpi_sum ,mpi_ke_world,ierr)       ! MPI
!         fsi(1:np_e,1:nlmta,1) = fs_mpi(1:np_e,1:nlmta)
!      end if
!      deallocate(fs_mpi)

       zaj_l(:,:,ik,:) = zajold_l(:,:,:)
       fsr_l(:,:,ik) = fsrold_l(:,:)
       if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
          fsi_l(:,:,ik) = fsiold_l(:,:)
       end if
#ifdef SAVE_FFT_TIMES
       if(sw_save_fft == ON) status_saved_phifftr(:,ik) = OLD
#endif
    endif

!!$    itmp=modnrm
!!$    modnrm=EXECUT
!!$    itmp=modnrm
!!$    modnrm=EXECUT
!!$    call m_ES_betar_dot_Psi_4_each_k_3D(wfsd_l,ik,ik,ik,bsdr_l,bsdi_l)
!!$    call m_ES_orthogonalize_SD_to_WFs_3D(ik,to,wfsd_l,bsdr_l,bsdi_l) ! ->(wfsd_l,bsd(ri)_l)
!!$    modnrm=itmp
    if(modnrm==EXECUT) then
       call m_ES_betar_dot_Psi_4_each_k_3D(nfout,wfsd_l,ik,ik,ik,bsdr_l,bsdi_l,map=feigconv)
       call m_ES_orthogonal_phi_to_WFs(ik,wfsd_l,bsdr_l,bsdi_l) ! ->(wfsd_l,bsd(ri)_l)
    else
       call m_ES_orthogonal_phi_to_WFs(ik,wfsd_l,bsdr_l,bsdi_l) ! ->(wfsd_l,bsd(ri)_l)
       call m_ES_betar_dot_Psi_4_each_k_3D(nfout,wfsd_l,ik,ik,ik,bsdr_l,bsdi_l,map=feigconv)
    end if
    call tstatc0_end(id_sname)
  end subroutine orthogonalize_SD_drctns

  logical function allbands_converged()
    integer :: ib,ierr
    logical :: logi
    logi = .true.
    do ib=1,np_e
      if(.not.feigconv(ib)) then
        logi = .false.
        exit
      endif
    enddo
    call mpi_allreduce(mpi_in_place,logi,1,mpi_logical,mpi_land,mpi_k_world(myrank_k),ierr)
    allbands_converged = logi
  end function allbands_converged

#endif
end module m_ES_WF_by_ModifiedDavidson

