  subroutine m_Force_term_Elocal_and_Epc_3D(nfout,pos)

    use m_Parallelization,only: ista_kngp  => ista_kngp_3D      &
                              , iend_kngp  => iend_kngp

    integer, intent(in) :: nfout
    real(kind=DP), intent(in), dimension(natm,3) :: pos 
    real(kind=DP), allocatable,dimension(:):: zfcos, zfsin
    integer :: ia, it, ipc
    integer :: id_sname = -1
    call tstatc0_begin('m_Force_term_Elocal_and_Epc ', id_sname,1)
    allocate(forcw(natm,3))
    allocate(zfsin(ista_kngp:iend_kngp))
    allocate(zfcos(ista_kngp:iend_kngp))
#ifndef REMOVE_PC_FROM_FORCE
    call m_XC_cal_potential_3D(nfout,Partial_Core_Charge,chgq_l_3D,VXC_AND_EXC)
                                               !  -> vxcpc_l
#endif
    forcw = 0.d0
    do ia = 1, natm
       it = ityp(ia); ipc = itpcc(it)
       call calc_phase2(natm,pos,ia,kgp,ngabc,ista_kngp,iend_kngp,zfcos,zfsin)
       call force_from_Elocal_4_each_atom ! -(contained here) ->forcw
       if(ipc /= 0 ) call force_from_Epc_for_each_atom    ! -> forcw
    end do

    do ia = 1, natm
       call pucv2cart(rltv,forcw(ia,1),forcw(ia,2),forcw(ia,3))
       forc_l(ia,1:3) = forc_l(ia,1:3) + forcw(ia,1:3)
    end do

    deallocate(zfcos,zfsin)
    deallocate(forcw)
    call tstatc0_end(id_sname)
  contains
    subroutine force_from_Epc_for_each_atom
      integer :: is, i
      real(kind=DP) :: tp
      real(kind=DP), dimension(3) :: f1, f2
      real(kind=DP), dimension(3) :: f1_mpi, f2_mpi ! MPI
      f1 = 0.d0
      f2 = 0.d0
      f1_mpi = 0.d0
      f2_mpi = 0.d0

#ifdef NEC_TUNE_SMP
!CDIR NOCONCUR
#endif
      do is = 1, nspin
         if(kimg == 1) then
#ifdef NEC_TUNE_SMP
!CDIR INNER
#endif
            do i = ista_kngp, iend_kngp  !for mpi
               tp  = zfsin(i)*vxc_l(i,1,is)*rhpcg_l_3D(i,ipc)
               f1_mpi(1) = f1_mpi(1) + tp*ngabc(i,1)
               f1_mpi(2) = f1_mpi(2) + tp*ngabc(i,2)
               f1_mpi(3) = f1_mpi(3) + tp*ngabc(i,3)
            end do
         else if(kimg == 2) then
!!$#ifdef NEC_TUNE_SMP
!!$!CDIR INNER
!!$#endif
            do i = ista_kngp, iend_kngp  !for mpi
               tp  = (zfsin(i)*vxc_l(i,1,is)+zfcos(i)*vxc_l_3D(i,2,is))&
                    &*rhpcg_l_3D(i,ipc)
               f1_mpi(1) = f1_mpi(1) + tp*ngabc(i,1)
               f1_mpi(2) = f1_mpi(2) + tp*ngabc(i,2)
               f1_mpi(3) = f1_mpi(3) + tp*ngabc(i,3)
            end do
         end if
      end do
!f      call mpi_allreduce(f1_mpi,f1,3 &
!f           &  ,mpi_double_precision,mpi_sum,mpi_comm_group,ierr)
      call mpi_allreduce(f1_mpi,f1,3 &
           &  ,mpi_double_precision,mpi_sum,mpi_ke_world,ierr)
#ifndef REMOVE_PC_FROM_FORCE
      if(kimg == 1) then
         do i = ista_kngp, iend_kngp  !for mpi
            tp  = zfsin(i)*vxcpc_l_3D(i,1)*rhpcg_l_3D(i,ipc)
            f2_mpi(1) = f2_mpi(1) + tp*ngabc(i,1)
            f2_mpi(2) = f2_mpi(2) + tp*ngabc(i,2)
            f2_mpi(3) = f2_mpi(3) + tp*ngabc(i,3)
         end do
      else if(kimg == 2) then
         do i = ista_kngp, iend_kngp  !for mpi
            tp = (zfsin(i)*vxcpc_l_3D(i,1)+zfcos(i)*vxcpc_l_3D(i,2))*rhpcg_l_3D(i,ipc)
            f2_mpi(1) = f2_mpi(1) + tp*ngabc(i,1)
            f2_mpi(2) = f2_mpi(2) + tp*ngabc(i,2)
            f2_mpi(3) = f2_mpi(3) + tp*ngabc(i,3)
         end do
!xocl end spread
      end if
!f      call mpi_allreduce(f2_mpi,f2,3 &
!f           &  ,mpi_double_precision,mpi_sum,mpi_comm_group,ierr)
      call mpi_allreduce(f2_mpi,f2,3 &
           &  ,mpi_double_precision,mpi_sum,mpi_ke_world,ierr)
!!$      forcw(ia,1) = forcw(ia,1) + (f1(1)/nspin - f2(1))*univol
!!$      forcw(ia,2) = forcw(ia,2) + (f1(2)/nspin - f2(2))*univol
!!$      forcw(ia,3) = forcw(ia,3) + (f1(3)/nspin - f2(3))*univol
      forcw(ia,1) = forcw(ia,1) + (f1(1) - f2(1))*univol
      forcw(ia,2) = forcw(ia,2) + (f1(2) - f2(2))*univol
      forcw(ia,3) = forcw(ia,3) + (f1(3) - f2(3))*univol
#else
!!$      forcw(ia,1) = forcw(ia,1) + (f1(1)/nspin)*univol
!!$      forcw(ia,2) = forcw(ia,2) + (f1(2)/nspin)*univol
!!$      forcw(ia,3) = forcw(ia,3) + (f1(3)/nspin)*univol
      forcw(ia,1) = forcw(ia,1) + f1(1)*univol
      forcw(ia,2) = forcw(ia,2) + f1(2)*univol
      forcw(ia,3) = forcw(ia,3) + f1(3)*univol
#endif

    end subroutine force_from_Epc_for_each_atom

    subroutine force_from_Elocal_4_each_atom
      integer :: is, i
      real(kind=DP) :: tmp
      real(kind=DP) :: fx,fy,fz
      real(kind=DP), pointer, dimension(:) :: f1_mpi,f2_mpi ! MPI
      fx = 0.d0; fy = 0.d0; fz = 0.d0
      allocate(f1_mpi(3)); allocate(f2_mpi(3))              ! MPI

#ifdef NEC_TUNE_SMP
!CDIR NOCONCUR
#endif
      do is = 1, nspin
         if(kimg == 1) then
#ifdef NEC_TUNE_SMP
!CDIR INNER
#endif
            do i = ista_kngp, iend_kngp  !for mpi
               tmp = zfsin(i)*chgq_l_3D(i,1,is)*psc_l_3D(i,it)
               fx = fx + tmp*ngabc(i,1)
               fy = fy + tmp*ngabc(i,2)
               fz = fz + tmp*ngabc(i,3)
            end do
         else if(kimg == 2) then
#ifdef NEC_TUNE_SMP
!CDIR INNER
#endif
            do i = ista_kngp, iend_kngp  !for mpi
               tmp = (zfsin(i)*chgq_l_3D(i,1,is)+zfcos(i)*chgq_l_3D(i,2,is))&
                    &*psc_l_3D(i,it)
               fx = fx + tmp*ngabc(i,1)
               fy = fy + tmp*ngabc(i,2)
               fz = fz + tmp*ngabc(i,3)
            end do
         end if
      end do
      f1_mpi(1) = fx; f1_mpi(2) = fy; f1_mpi(3) = fz
!f      call mpi_allreduce(f1_mpi,f2_mpi,3 &
!f           &  ,mpi_double_precision,mpi_sum,mpi_comm_group,ierr)
      call mpi_allreduce(f1_mpi,f2_mpi,3 &
           &  ,mpi_double_precision,mpi_sum,mpi_ke_world,ierr)
      forcw(ia,1:3) = f2_mpi(1:3)*univol
      deallocate(f2_mpi); deallocate(f1_mpi)
    end subroutine force_from_Elocal_4_each_atom
  end subroutine m_Force_term_Elocal_and_Epc_3D

  subroutine m_Force_term_drv_of_vlhxcQ_3D(nfout,pos,napt)
    use m_Parallelization,only: ista_kngp  => ista_kngp_3D      &
                              , iend_kngp  => iend_kngp
    !  ----
    ! (Rev) T. Yamaskai, 31, Aug, 2007
    !     1. 'call set_index_arrays1' that included a bug is replaced
    !       by 'call m_PP_set_index_arrays1', whose bug is fixed.
    !     2. 'call set_index_arrays2' is also replaced by 'call
    !       m_PP_set_index_arrays2' that can be referred from other modules.
    !     3. contained subroutines, set_index_arrays1 and set_index_arrays2 were
    !       deleted.
    integer,intent(in)                           :: nfout
    real(kind=DP), intent(in), dimension(natm,3) :: pos
    integer,intent(in), dimension(natm,nopr+af)  :: napt

    real(kind=DP), allocatable, dimension(:)                  :: ylm_t
    real(kind=DP), allocatable, target, dimension(:,:)        :: ylm_ext
    real(kind=DP), allocatable, dimension(:):: zfcos, zfsin

#ifndef _VECTOR_TUNING_
    real(kind=DP), allocatable, dimension(:,:) :: flhxcq_mpi
    real(kind=DP), allocatable, dimension(:) :: ylm_sum
    integer :: ncache, ibsize, iwidth, ibl1, ibl2, iwidthmax
#endif

#ifndef _m_Force_no_loop_exchange_
    integer :: m, maxm, ip, np, iq
    integer, parameter :: mcritical = 4*2+1
    integer, allocatable, dimension(:) :: nqitg_sp, nqitg_sp0 !d(ntyp)
    integer, allocatable, dimension(:) :: iq2l3 ! d(nqitg)
    integer, allocatable, dimension(:,:) :: nc  ! d(maxm,nqitg)
    integer :: mc ! maxval(nc)
    integer, allocatable, dimension(:,:,:) :: nc2lmt1, nc2lmt2, nc2n ! d(mc,maxm,nqitg)
#endif
    
    integer :: is,it,lmt1,lmt2,n,ia,mdvdb,il1,tau1,il2,tau2,ilm3,l3,iiqitg
    real(kind=DP) :: fx,fy,fz,shdg_x
    integer       :: id_sname = -1
    call tstatc0_begin('m_Force_term_drv_of_vlhxcQ ',id_sname,1)

    call m_PP_find_maximum_l(n)
    n = (n-1) + (n-1) + 1

    if(modnrm == EXECUT) then
       allocate(il3(n**2)); call substitute_il3(n**2,il3) ! -(b_Elec..)
       if(n**2 > nel_Ylm) then
          allocate(ylm_ext(ista_kngp:iend_kngp,nel_Ylm+1:n**2)); ylm_ext = 0.d0
       end if
       allocate(ylm_t(ista_kngp:iend_kngp)); ylm_t = 0.d0
       do ilm3 = nel_Ylm+1, n**2
          call m_pwBS_sphrp2(ilm3,rltv,ista_kngp,iend_kngp,ylm_t)
          ylm_ext(:,ilm3) = ylm_t(:)
       end do
       deallocate(ylm_t)

#ifndef _m_Force_no_loop_exchange_
       allocate(nqitg_sp(ntyp)); allocate(nqitg_sp0(ntyp))
       allocate(iq2l3(nqitg))
       allocate(nc(mcritical,nqitg));nc=0
       call m_PP_set_index_arrays1(nfout,ntyp,nqitg,mcritical,n**2,il3 &
            & ,maxm,mc,nqitg_sp,nqitg_sp0,iq2l3,nc)  ! -> nqitg_sp, nqitg_sp0, iq2l3,nc, maxm, mc
       allocate(nc2lmt1(mc,maxm,nqitg))
       allocate(nc2lmt2(mc,maxm,nqitg))
       allocate(nc2n(mc,maxm,nqitg))
       call m_PP_set_index_arrays2(nfout,mc,maxm,nqitg,mcritical,n**2,il3,iq2l3 &
            & ,nc2lmt1,nc2lmt2,nc2n,nc) ! -> nc2lmt1, nc2lmt2, nc2n, nc
#endif
       
    end if

    flhxcq_l = 0.d0
#ifdef _VECTOR_TUNING_
    allocate(zfcos(ista_kngp:iend_kngp))
    allocate(zfsin(ista_kngp:iend_kngp))
    do it = 1, ntyp
       mdvdb = m_PP_include_vanderbilt_pot(it)
       if(mdvdb == SKIP) cycle

       do ia= 1,natm
          if(ityp(ia) /= it) cycle
          fx = 0.d0; fy = 0.d0; fz = 0.d0
          call calc_phase2(natm,pos,ia,kgp,ngabc,ista_kngp,iend_kngp,zfcos,zfsin)
          do is = 1, nspin, af+1
#ifndef _m_Force_no_loop_exchange_
             do iq = nqitg_sp0(it), nqitg_sp(it)
                l3 = iq2l3(iq)
                do m = 1, 2*l3+1
                   ilm3 = l3*l3+m
                   call sum_hsr_dot_gauntc(is,it,ia,iq,m)
                   if(ilm3 <= nel_Ylm) then
                      ylm => ylm_l(:,ilm3)
                   else
                      ylm => ylm_ext(:,ilm3)
                   end if
                   call add_hardpart_to_flhxcq_l_core(iq,ista_kngp,iend_kngp)
                   !    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                   !   --> fx,fy,fz
                end do
             end do
#else
             do lmt1 = 1, ilmt(it)
                il1 = ltp(lmt1,it); tau1 = taup(lmt1,it)
                do lmt2 = lmt1, ilmt(it)
                   il2 = ltp(lmt2,it); tau2 = taup(lmt2,it)
                   do n = 1, il2p(lmt1,lmt2,it)
                      ilm3 = isph(lmt1,lmt2,n,it);    l3   =  il3(ilm3)
                      iiqitg = iqitg(il1,tau1,il2,tau2,l3+1,it)
                      if(iiqitg == 0) cycle
                      call hsr_dot_gaunt(is,it,ia,lmt1,lmt2,n) 
                      if(ilm3 <= nel_Ylm) then
                         ylm => ylm_l(:,ilm3)
                      else
                         ylm => ylm_ext(:,ilm3)
                      end if
                      call add_hardpart_to_flhxcq_l_core(iiqitg,ista_kngp,iend_kngp)
                      !    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                      !   --> fx,fy,fz
                   end do ! n
                end do ! lmt2
             end do
#endif
          end do
          call pucv2cart(rltv,fx,fy,fz)
!!$       call fxyz_gsum(fx,fy,fz)
          flhxcq_l(ia,1) = fx; flhxcq_l(ia,2) = fy; flhxcq_l(ia,3) = fz
       end do
    end do
    deallocate(zfsin, zfcos)
#else

#ifndef _m_Force_no_loop_exchange_
    ncache = (m_CtrlP_cachesize()*1024)*3/4
    if(ncache == 0) ibsize = iend_kngp - ista_kngp + 1

    do it = 1, ntyp
       mdvdb = m_PP_include_vanderbilt_pot(it)
       if(mdvdb == SKIP) cycle
       if(ipriforce >= 2) write(nfout,'(" !mForce: it = ",i8)') it
       
       if(ncache/=0) then
          iwidth = nqitg_sp(it) - nqitg_sp0(it) + 1
          if(kimg == 1) then ! qitg_l(i,iq)*ylm(iy)*zfsc1_1(iy)
             ibsize=ncache/(8*(iwidth + 1))
          else ! qitg_l(i,iq),ylm(iy),zfsc1_1(iy),zfsc2_1(iy)
             ibsize=ncache/(8*(iwidth + 2))
          endif
          if(ipriforce >= 2) write(nfout,'(" !mForce: ibsize, iwidth, ncache = ",3i8)') ibsize, iwidth, ncache
       end if

       IBLOCK: do ibl1=ista_kngp,iend_kngp,ibsize
          ibl2=ibl1+ibsize-1
          if(ibl2.gt.iend_kngp) ibl2=iend_kngp
          if(ibl2.gt.kgp) ibl2 = kgp
          allocate(ylm_sum(ibl1:ibl2))
          allocate(zfcos(ibl1:ibl2)); allocate(zfsin(ibl1:ibl2))

       do ia= 1,natm
          if(ityp(ia) /= it) cycle
          fx = 0.d0; fy = 0.d0; fz = 0.d0
          call calc_phase_div(ia)  ! -> zfsin, zfcos
          do is = 1, nspin, af+1
             do iq = nqitg_sp0(it), nqitg_sp(it)
                l3 = iq2l3(iq)
                ylm_sum = 0.d0
                do m = 1, 2*l3+1
                   ilm3 = l3*l3+m
                   call sum_hsr_dot_gauntc(is,it,ia,iq,m)
                   if(ilm3 <= nel_Ylm) then
                      ylm_sum(ibl1:ibl2) = ylm_sum(ibl1:ibl2) + shdg_x*ylm_l(ibl1:ibl2,ilm3)
!!!$                      ylm => ylm_l(:,ilm3)
                   else
                      ylm_sum(ibl1:ibl2) = ylm_sum(ibl1:ibl2) + shdg_x*ylm_ext(ibl1:ibl2,ilm3)
!!!$                      ylm => ylm_ext(:,ilm3)
                   end if
!!!$                   call add_hardpart_to_flhxcq_l_core(iq,ibl1,ibl2)
!!!$                   !    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!!!$                   !   --> fx,fy,fz
                end do
                call add_hardpart_to_flhxcq_l_div(iq,ibl1,ibl2)
             end do
          end do
          call pucv2cart(rltv,fx,fy,fz)
          flhxcq_l(ia,1)=flhxcq_l(ia,1)+fx; flhxcq_l(ia,2)=flhxcq_l(ia,2)+fy; flhxcq_l(ia,3)=flhxcq_l(ia,3)+fz
       end do
       deallocate(zfsin,zfcos)
       deallocate(ylm_sum)
       end do IBLOCK
    end do
    if(npes>1) then
       allocate(flhxcq_mpi(natm,3)); flhxcq_mpi = 0.d0
       call mpi_allreduce(flhxcq_l,flhxcq_mpi,3*natm,mpi_double_precision &
            & , mpi_sum, mpi_comm_group,ierr)
       flhxcq_l = flhxcq_mpi
       deallocate(flhxcq_mpi)
    end if
#else
    stop ' you cannot define _m_Force_no_loop_exchange_'
#endif
#endif

!!$!xocl spread do/ind_natm
!!$    do ia = 1, natm
!!$       fx = flhxcq_l(ia,1); fy = flhxcq_l(ia,2); fz = flhxcq_l(ia,3)
!!$       call pucv2cart(rltv,fx,fy,fz)
!!$       flhxcq_l(ia,1) = fx; flhxcq_l(ia,2) = fy; flhxcq_l(ia,3) = fz
!!$    end do
!!$!xocl end spread

    if(af /= 0) then
       allocate(forc_up(natm2,3));allocate(forc_down(natm2,3))
       call forcaf2(natm,natm2,nopr+af,natm,nopr,napt&
            &, iwei,op,ipriforce,af,flhxcq_l,forc_up,forc_down)
       deallocate(forc_up); deallocate(forc_down)
    endif
    if(modnrm == EXECUT) then
       if(allocated(ylm_ext)) deallocate(ylm_ext)
       deallocate(il3)
#ifndef _m_Force_no_loop_exchange_
       deallocate(nc2n,nc2lmt2,nc2lmt1,nc,iq2l3,nqitg_sp,nqitg_sp0)
#endif
    end if

    call tstatc0_end(id_sname)
  contains
#ifndef SX
    subroutine calc_phase_div(ia)
      integer, intent(in) :: ia
      integer :: ig
      real(kind=DP) :: fx,fy,fz, ph
      fx = pos(ia,1)*PAI2
      fy = pos(ia,2)*PAI2
      fz = pos(ia,3)*PAI2
      do ig = ibl1, ibl2
         ph = ngabc(ig,1)*fx + ngabc(ig,2)*fy + ngabc(ig,3)*fz
         zfcos(ig) = dcos(ph)
         zfsin(ig) = dsin(ph)
      end do
    end subroutine calc_phase_div
#endif

#ifndef _m_Force_no_loop_exchange_
    subroutine sum_hsr_dot_gauntc(is,it,ia,iq,m)
      integer, intent(in) :: is,it,ia,iq,m
      integer :: ip, lmt1, lmt2, np
      real(kind=DP) :: fac
      shdg_x = 0.d0
      do ip = 1, nc(m,iq)
         lmt1 = nc2lmt1(ip,m,iq)
         lmt2 = nc2lmt2(ip,m,iq)
         np = nc2n(ip,m,iq)
         fac = 2.d0; if(lmt1 == lmt2) fac = 1.d0
         shdg_x = shdg_x + &
              & fac*hsr(ia,lmt1,lmt2,is)*dl2p(lmt1,lmt2,np,it)
      end do
    end subroutine sum_hsr_dot_gauntc
#else
    subroutine hsr_dot_gaunt(is,it,ia,lmt1,lmt2,n)
      integer, intent(in) :: is, it, ia, lmt1,lmt2,n
      real(kind=DP) :: fac
      
      fac = 2.d0; if(lmt1 == lmt2) fac = 1.d0
      shdg_x = fac*hsr(ia,lmt1,lmt2,is)*dl2p(lmt1,lmt2,n,it)
    end subroutine hsr_dot_gaunt

#endif
    subroutine add_hardpart_to_flhxcq_l_core(iq,ibl1,ibl2)
      integer, intent(in) :: iq,ibl1,ibl2
      integer       :: i, iy
      real(kind=DP) :: flchgq, f, tx,ty,tz, f2
      real(kind=DP),pointer,dimension(:) :: f1_mpi, f2_mpi
!!$      real(kind=DP),pointer,dimension(:) :: zfsc1, zfsc2

!!$      if(mod(l3,2) == 0) then
!!$         flchgq = fac*real(zi**(-l3))*dga*hsr(ia,lmt1,lmt2,is)
!!$         flchgq = real(zi**(-l3))*shdg_x
!!$         if(kimg == 1) then
!!$            zfsc1 => zfsin
!!$         else
!!$            f2 = 1;  zfsc1 => zfsin; zfsc2 => zfcos
!!$         end if
!!$      else
!!$         flchgq = - aimag(zi**(-l3))*shdg_x
!!$         if(kimg == 1) then
!!$            zfsc1 => zfcos
!!$         else
!!$            f2 = -1; zfsc1 => zfcos; zfsc2 => zfsin
!!$         end if
!!$      end if

      tx = 0.d0; ty = 0.d0; tz = 0.d0
      if(mod(l3,2) == 0) then
         flchgq = real(zi**(-l3))*shdg_x
         if(kimg == 1) then
#ifdef NEC_TUNE_SMP
!CDIR INNER
#endif
            do i = ibl1, ibl2  !for mpi
               iy = i-ista_kngp+1
!!$               f  = qitg_l(i,iq)*ylm(iy)*vlhxc_l(i,1,is)*zfsc1(iy)
               f  = qitg_l_3D(i,iq)*ylm(iy)*vlhxc_l_3D(i,1,is)*zfsin(i)
               tx = tx + f*ngabc(i,1);  ty = ty + f*ngabc(i,2)
               tz = tz + f*ngabc(i,3)
            end do
         else if(kimg == 2) then
#ifdef NEC_TUNE_SMP
!CDIR INNER
#endif
            do i = ibl1, ibl2  !for mpi
               iy = i-ista_kngp+1
               f = qitg_l_3D(i,iq)*ylm(iy)*&
                    &(vlhxc_l_3D(i,1,is)*zfsin(i)+vlhxc_l_3D(i,2,is)*zfcos(i))
!!$                    &(vlhxc_l(i,1,is)*zfsc1(iy)+vlhxc_l(i,2,is)*zfsc2(iy))
               tx = tx + f*ngabc(i,1);  ty = ty + f*ngabc(i,2)
               tz = tz + f*ngabc(i,3)
            end do
         end if
      else
         flchgq = - aimag(zi**(-l3))*shdg_x
         if(kimg == 1) then
#ifdef NEC_TUNE_SMP
!CDIR INNER
#endif
            do i = ibl1, ibl2  !for mpi
               iy = i-ista_kngp+1
!!$               f  = qitg_l(i,iq)*ylm(iy)*vlhxc_l(i,1,is)*zfsc1(iy)
               f  = qitg_l_3D(i,iq)*ylm(iy)*vlhxc_l_3D(i,1,is)*zfcos(i)
               tx = tx + f*ngabc(i,1);  ty = ty + f*ngabc(i,2)
               tz = tz + f*ngabc(i,3)
            end do
         else if(kimg == 2) then
#ifdef NEC_TUNE_SMP
!CDIR INNER
#endif
            do i = ibl1, ibl2  !for mpi
               iy = i-ista_kngp+1
               f = qitg_l_3D(i,iq)*ylm(iy)*&
                    &(vlhxc_l_3D(i,1,is)*zfcos(i)-vlhxc_l_3D(i,2,is)*zfsin(i))
!!$                    &(vlhxc_l(i,1,is)*zfsc1(iy)-vlhxc_l(i,2,is)*zfsc2(iy))
               tx = tx + f*ngabc(i,1);  ty = ty + f*ngabc(i,2)
               tz = tz + f*ngabc(i,3)
            end do
         end if
      end if

      if(npes > 1) then
         allocate(f1_mpi(3)); allocate(f2_mpi(3))
         f1_mpi(1) = tx; f1_mpi(2) = ty; f1_mpi(3) = tz  ! MPI
!f         call mpi_allreduce(f1_mpi,f2_mpi,3 &
!f              &  ,mpi_double_precision,mpi_sum,mpi_comm_group,ierr) ! MPI
         call mpi_allreduce(f1_mpi,f2_mpi,3 &
              &  ,mpi_double_precision,mpi_sum,mpi_ke_world,ierr) ! MPI
         tx = f2_mpi(1); ty = f2_mpi(2); tz = f2_mpi(3)
         deallocate(f2_mpi); deallocate(f1_mpi)
      end if
      fx = fx + flchgq*tx*univol
      fy = fy + flchgq*ty*univol
      fz = fz + flchgq*tz*univol

    end subroutine add_hardpart_to_flhxcq_l_core

#ifndef SX
    subroutine add_hardpart_to_flhxcq_l_div(iq,ibl1,ibl2)
      integer, intent(in) :: iq,ibl1,ibl2
      integer       :: i
      real(kind=DP) :: flchgq, f, tx,ty,tz, f2
      real(kind=DP),pointer,dimension(:) :: f1_mpi, f2_mpi

      tx = 0.d0; ty = 0.d0; tz = 0.d0
      if(mod(l3,2) == 0) then
         flchgq = real(zi**(-l3))
         if(kimg == 1) then
            do i = ibl1, ibl2  !for mpi
               f  = qitg_l_3D(i,iq)*ylm_sum(i)*vlhxc_l_3D(i,1,is)*zfsin(i)
               tx = tx + f*ngabc(i,1);  ty = ty + f*ngabc(i,2)
               tz = tz + f*ngabc(i,3)
            end do
         else if(kimg == 2) then
            do i = ibl1, ibl2  !for mpi
               f = qitg_l_3D(i,iq)*ylm_sum(i)*&
                    &(vlhxc_l_3D(i,1,is)*zfsin(i)+vlhxc_l_3D(i,2,is)*zfcos(i))
               tx = tx + f*ngabc(i,1);  ty = ty + f*ngabc(i,2)
               tz = tz + f*ngabc(i,3)
            end do
         end if
      else
         flchgq = - aimag(zi**(-l3))
         if(kimg == 1) then
            do i = ibl1, ibl2  !for mpi
               f  = qitg_l_3D(i,iq)*ylm_sum(i)*vlhxc_l_3D(i,1,is)*zfcos(i)
               tx = tx + f*ngabc(i,1);  ty = ty + f*ngabc(i,2)
               tz = tz + f*ngabc(i,3)
            end do
         else if(kimg == 2) then
            do i = ibl1, ibl2  !for mpi
               f = qitg_l_3D(i,iq)*ylm_sum(i)*&
                    &(vlhxc_l_3D(i,1,is)*zfcos(i)-vlhxc_l_3D(i,2,is)*zfsin(i))
               tx = tx + f*ngabc(i,1);  ty = ty + f*ngabc(i,2)
               tz = tz + f*ngabc(i,3)
            end do
         end if
      end if

!!$      if(npes > 1) then
!!$         allocate(f1_mpi(3)); allocate(f2_mpi(3))
!!$         f1_mpi(1) = tx; f1_mpi(2) = ty; f1_mpi(3) = tz  ! MPI
!!$         call mpi_allreduce(f1_mpi,f2_mpi,3 &
!!$              &  ,mpi_double_precision,mpi_sum,mpi_comm_group,ierr) ! MPI
!!$         tx = f2_mpi(1); ty = f2_mpi(2); tz = f2_mpi(3)
!!$         deallocate(f2_mpi); deallocate(f1_mpi)
!!$      end if
      fx = fx + flchgq*tx*univol
      fy = fy + flchgq*ty*univol
      fz = fz + flchgq*tz*univol

    end subroutine add_hardpart_to_flhxcq_l_div
#endif
  end subroutine m_Force_term_drv_of_vlhxcQ_3D

#ifdef FORCE_DGEMM
! <-- T.Kokubo & D.Fukata, Feb. 2010
  subroutine m_Force_term_drv_of_flmt_3D(kv3,pos,napt)
    integer, intent(in)                          :: kv3
    real(kind=DP), intent(in), dimension(natm,3) :: pos
    integer,intent(in), dimension(natm,nopr+af)     :: napt

    integer       :: id_sname = -1
    integer :: ibsize,ibl1,ibl2, is,ik,iksnl,ia,it,mdvdb,aoffset
    real(kind=DP) :: fac, fx, fy, fz

    integer, allocatable, dimension(:)       :: mil_tmp
    integer, allocatable, dimension(:,:)     :: ngabc_red
    real(kind=DP),allocatable,dimension(:,:) :: zfsin,  zfcos   ! MPI
    real(kind=DP),allocatable,dimension(:,:,:) :: ar_tmp, ai_tmp  ! MPI
    real(kind=DP),allocatable,dimension(:,:) :: fnlxyz_mpi
!f#ifdef PARA3D
    integer :: icnt
    real(kind=DP),allocatable,dimension(:,:,:)   :: work1
    real(kind=DP),allocatable,dimension(:,:)     :: occup_l_3D
!f#endif

      call tstatc0_begin('m_Force_term_drv_of_flmt ',id_sname,1)
!f#ifdef PARA3D
      allocate(    gr(np_e_3D,nlmta,3) )    ! MPI
      allocate(    gi(np_e_3D,nlmta,3) )    ! MPI
      allocate( work1(np_e_3D,nlmta,3) )    ! MPI
      allocate(occup_l_3D(np_e_3D,ista_k_3D:iend_k_3D)) ; occup_l_3D = 0.d0
!f#else
!f      allocate( gr(np_e,nlmta,3) )    ! MPI
!f      allocate( gi(np_e,nlmta,3) )    ! MPI
!f#endif
      fac = 2.d0/kv3
      fnlxyz_l = 0.d0

      if(nblocksize_force_is_given) then
         ibsize = nblocksize_force
      else
         ibsize= nb_force_default
      end if
      write(6,'(" ! ibsize = ",i8," <<m_Force_term_drv_of_flmt>>")') ibsize
      allocate(   mil_tmp(nlmta)    );   mil_tmp = 0
      allocate( ngabc_red(ibsize,3) ); ngabc_red = 0
      allocate(     zfcos(ibsize,natm),      zfsin(ibsize,natm)    )
      allocate(    ar_tmp(ibsize,nlmta,3),  ai_tmp(ibsize,nlmta,3) )

!f#ifdef PARA3D
! ==============================================================================
!$    do is = 1, nspin, af+1
!$       do ik = is, kv3-nspin+is, nspin
!$          if(map_k(ik) /= myrank_k) cycle
!$          call decomp_zaj_l_3D_ik(zaj_l,zaj_l_3D,ik,nrvf_ordr,"    ")
!$          call decomp_fsr_l_3D_ik(fsr_l,fsr_l_3D,ik,nrvf_ordr,"    ")
!$          if(.not.(kv3/nspin == 1 .and. k_symmetry(1) == GAMMA .and. kimg == 2)) then
!$             call decomp_fsr_l_3D_ik(fsi_l,fsi_l_3D,ik,nrvf_ordr,"    ")
!$          endif
!$          call decomp_eko_l_3D_new(eko_l,eko_l_3D,ik,nrvf_ordr,"    ")
!$          call decomp_eko_l_3D_new(occup_l,occup_l_3D,ik,nrvf_ordr,"    ")
!$       enddo
!$    enddo
! ==============================================================================
      do is = 1, nspin, af+1
         do ik = is, kv3+is-nspin,nspin
           if(map_k_3D(ik) /= myrank_k_3D) cycle
!f            if(map_k(ik) /= myrank_k) cycle
            iksnl = (ik-1)/nspin+1
! === fs[ri]_3D -> fs[ri]_2D =============================================================
            if(.not. allocated(fsr_l_2D)) allocate(fsr_l_2D(np_e_3D,nlmta)); fsr_l_2D =0.0d0
            if( k_symmetry(ik) /= GAMMA ) then
               if(.not. allocated(fsi_l_2D)) allocate(fsi_l_2D(np_e_3D,nlmta)); fsi_l_2D =0.0d0
            endif
            call gather_f_3d_to_2d(fsr_l_3D, fsr_l_2D)
            if( k_symmetry(ik) /= GAMMA ) then
               call gather_f_3d_to_2d(fsi_l_3D, fsi_l_2D)
            endif
! ========================================================================================

            do ibl1 = ista_g1k_3D(ik), iend_g1k_3D(ik), ibsize
               ibl2 = ibl1+ibsize-1
               if(ibl2 .gt. iend_g1k_3D(ik) ) ibl2 = iend_g1k_3D(ik)
               gr=0.d0
               gi=0.d0

               call substitute_ngabcred(ik)         ! ngabc,nbase                   --> ngabc_red
               call calc_phase_div()                ! pos, ngabc_red                --> zfcos, zfsin
               call pre_drv_of_betar_dot_WFs_div()  ! zfsin,zfcos,snl               --> ar_tmp,ai_tmp
               call drv_of_betar_dot_WFs_div(gr,gi) ! ar_tmp,ai_tmp,zaj_l,ngabc_red --> gr,gi

               icnt = np_e_3D*nlmta*3
!              call mpi_allreduce(gr,work1,icnt,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_gg_world,ierr)
               call mpi_allreduce(gr,work1,icnt,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_ke_world,ierr)
               gr(1:np_e_3D,1:nlmta,1:3)=work1(1:np_e_3D,1:nlmta,1:3)
!              call mpi_allreduce(gi,work1,icnt,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_gg_world,ierr)
               call mpi_allreduce(gi,work1,icnt,MPI_DOUBLE_PRECISION,MPI_SUM,mpi_ke_world,ierr)
               gi(1:np_e_3D,1:nlmta,1:3)=work1(1:np_e_3D,1:nlmta,1:3)

               aoffset = 0
               do ia = 1, natm
                  it = ityp(ia)
                  fx = 0.d0; fy = 0.d0; fz = 0.d0
                  mdvdb = m_PP_include_vanderbilt_pot(it)
                  if(mdvdb == EXECUT) then
                     call term_related_to_drv_of_flmt_VDB ! -> fx,fy,fz
                  else if(mdvdb == SKIP) then
                     call term_related_to_drv_of_flmt_NC  ! -> fx,fy,fz
                  end if
                  aoffset = aoffset + ilmt(it)
                  call pucv2cart(rltv,fx,fy,fz)
                  fnlxyz_l(ia,1) = fnlxyz_l(ia,1) + fx
                  fnlxyz_l(ia,2) = fnlxyz_l(ia,2) + fy
                  fnlxyz_l(ia,3) = fnlxyz_l(ia,3) + fz
               end do
            end do
         end do
      end do
!f#else
!f      do is = 1, nspin, af+1
!f         do ik = is, kv3+is-nspin,nspin
!f            if(map_k(ik) /= myrank_k) cycle
!f            iksnl = (ik-1)/nspin+1
!f
!f            do ibl1 = 1, iba(ik), ibsize
!f               ibl2 = ibl1+ibsize-1
!f               if(ibl2 .gt. iba(ik)) ibl2 = iba(ik)
!f               gr=0.d0
!f               gi=0.d0
!f
!f               call substitute_ngabcred(ik)         ! ngabc,nbase                   --> ngabc_red
!f               call calc_phase_div()                ! pos, ngabc_red                --> zfcos, zfsin
!f               call pre_drv_of_betar_dot_WFs_div()  ! zfsin,zfcos,snl               --> ar_tmp,ai_tmp
!f               call drv_of_betar_dot_WFs_div(gr,gi) ! ar_tmp,ai_tmp,zaj_l,ngabc_red --> gr,gi
!f
!f               aoffset = 0
!f               do ia = 1, natm
!f                  it = ityp(ia)
!f                  fx = 0.d0; fy = 0.d0; fz = 0.d0
!f                  mdvdb = m_PP_include_vanderbilt_pot(it)
!f                  if(mdvdb == EXECUT) then
!f                     call term_related_to_drv_of_flmt_VDB ! -> fx,fy,fz
!f                  else if(mdvdb == SKIP) then
!f                     call term_related_to_drv_of_flmt_NC  ! -> fx,fy,fz
!f                  end if
!f                  aoffset = aoffset + ilmt(it)
!f                  call pucv2cart(rltv,fx,fy,fz)
!f                  fnlxyz_l(ia,1) = fnlxyz_l(ia,1) + fx
!f                  fnlxyz_l(ia,2) = fnlxyz_l(ia,2) + fy
!f                  fnlxyz_l(ia,3) = fnlxyz_l(ia,3) + fz
!f                end do
!f            end do
!f         end do
!f      end do
!f#endif

      deallocate( mil_tmp, ngabc_red, zfcos, zfsin, ar_tmp, ai_tmp )

      if(npes>1) then
         allocate(fnlxyz_mpi(natm,3)); fnlxyz_mpi = 0.d0
!f#ifdef PARA3D
! === Is MPI_Allreduce on k-direction necessary??? by tkato Feb. 25th ===
         call mpi_allreduce(fnlxyz_l,fnlxyz_mpi,3*natm,mpi_double_precision &
             & , mpi_sum, mpi_kg_world,ierr)
!f#else
!f         call mpi_allreduce(fnlxyz_l,fnlxyz_mpi,3*natm,mpi_double_precision &
!f             & , mpi_sum, mpi_comm_group,ierr)
!f#endif
         fnlxyz_l = fnlxyz_mpi
      end if
      if(npes>1)deallocate(fnlxyz_mpi)

      if(af /= 0) then
         allocate(forc_up(natm2,3));allocate(forc_down(natm2,3))
         call forcaf2(natm,natm2,nopr+af,natm,nopr,napt &
           & ,iwei,op,ipriforce,af,fnlxyz_l ,forc_up,forc_down)
         deallocate(forc_down);deallocate(forc_up)
      end if

!f#ifdef PARA3D
      deallocate( gi,gr,work1 )
      deallocate( occup_l_3D )
!f#else
!f      deallocate(gi); deallocate(gr)
!f#endif

      if(ipriforce >= 2)  then
         write(6,'(" --- fnlxyz_l ->>")')
         do ia = 1, natm
            write(6,'(i8,3f20.8)') ia, fnlxyz_l(ia,1:3)
         end do
         write(6,'(" <<- fnlxyz_l ---")')
      end if

      call tstatc0_end(id_sname)

    contains
!f#ifdef PARA3D
! === fs[ri]_3D -> fs[ri]_2D =============================================================
    subroutine gather_f_3d_to_2d(inn,out)
      use m_Parallelization,only : np_e    => np_e_3D    &
                                 , ista_fs => ista_fs_3D , iend_fs => iend_fs_3D
      real(kind=DP), dimension(:,:,:), intent(in)  :: inn
      real(kind=DP), dimension(:,:)  , intent(out) :: out
      real(kind=DP), dimension(:,:)  , allocatable :: send
      integer(kind=4) :: ierr , i, j

      allocate(send(np_e,nlmta),stat=ierr)
      if(ierr /= 0) then
         print *,'Error Not allocated'
         stop
      endif

      out  = 0.0d0
      send = 0.0d0
      do j = ista_fs, iend_fs
         do i = 1, np_e
            send(i,j) = inn(i,j-ista_fs+1,ik)
         enddo
      enddo

      call mpi_allreduce(send,out,np_e*nlmta,MPI_DOUBLE_PRECISION,MPI_SUM, &
     &                   mpi_ke_world,ierr)

      deallocate(send)
    end subroutine gather_f_3d_to_2d
! ========================================================================================
!f#endif
      subroutine term_related_to_drv_of_flmt_NC
        integer :: lmt1, lmta1, lmt2, lmta2, l1,m1,l2,m2, i, lmt11, lmt22
        real(kind=DP) :: fac_n, a, a_n, fr1,fi1,fr2,fi2

        do lmt1 = 1, ilmt(it)
           lmt11 = lmt1 + aoffset
           lmta1 = lmta(lmt1,ia); l1 = ltp(lmt1,it); m1 = mtp(lmt1,it)
           do lmt2 = lmt1, ilmt(it)
               lmt22 = lmt2 + aoffset
              l2 = ltp(lmt2,it); m2 = mtp(lmt2,it); lmta2 = lmta(lmt2,ia)
              if(l1 /= l2 .or. m1 /= m2) cycle
              fac_n = 2*fac; if(lmt1 == lmt2) fac_n = fac
              a = dion(lmt1,lmt2,it)
              if(k_symmetry(ik) == GAMMA) then
!f#ifdef PARA3D
                 do i = 1, np_e_3D                              ! MPI
                    a_n = fac_n * a *occup_l_3D(i,ik)
!                   fr1 = fsr_l_3D(i,lmta1,ik); fr2 = fsr_l_3D(i,lmta2,ik)
                    fr1 = fsr_l_2D(i,lmta1); fr2 = fsr_l_2D(i,lmta2)
!f#else
!f                 do i = 1, np_e                                 ! MPI
!f                    a_n = fac_n * a *occup_l(i,ik)
                    fr1 = fsr_l(i,lmta1,ik); fr2 = fsr_l(i,lmta2,ik)
!f#endif
                    fx  = fx - a_n * (gr(i,lmt11,1)*fr2 + fr1*gr(i,lmt22,1))
                    fy  = fy - a_n * (gr(i,lmt11,2)*fr2 + fr1*gr(i,lmt22,2))
                    fz  = fz - a_n * (gr(i,lmt11,3)*fr2 + fr1*gr(i,lmt22,3))
                 end do
              else
!f#ifdef PARA3D
                 do i = 1, np_e_3D                              ! MPI
                    a_n = fac_n * a *occup_l_3D(i,ik)
!                   fr1 = fsr_l_3D(i,lmta1,ik); fr2 = fsr_l_3D(i,lmta2,ik)
                    fr1 = fsr_l_2D(i,lmta1); fr2 = fsr_l_2D(i,lmta2)
!                   fi1 = fsi_l_3D(i,lmta1,ik); fi2 = fsi_l_3D(i,lmta2,ik)
                    fi1 = fsi_l_2D(i,lmta1); fi2 = fsi_l_2D(i,lmta2)
!f#else
!f                 do i = 1, np_e                                 ! MPI
!f                    a_n = fac_n * a *occup_l(i,ik)
!f                    fr1 = fsr_l(i,lmta1,ik); fr2 = fsr_l(i,lmta2,ik)
!f                    fi1 = fsi_l(i,lmta1,ik); fi2 = fsi_l(i,lmta2,ik)
!f#endif
                    fx  = fx - a_n * (gr(i,lmt11,1)*fr2+gi(i,lmt11,1)*fi2&
             +fr1*gr(i,lmt22,1)+fi1*gi(i,lmt22,1))
                    fy  = fy - a_n * (gr(i,lmt11,2)*fr2+gi(i,lmt11,2)*fi2&
             +fr1*gr(i,lmt22,2)+fi1*gi(i,lmt22,2))
                    fz  = fz - a_n * (gr(i,lmt11,3)*fr2+gi(i,lmt11,3)*fi2&
             +fr1*gr(i,lmt22,3)+fi1*gi(i,lmt22,3))
                    end do
                 end if
           end do
        end do
      end subroutine term_related_to_drv_of_flmt_NC

      subroutine term_related_to_drv_of_flmt_VDB
        integer :: lmt1, lmta1, lmt2, lmta2, i, lmt11, lmt22
        real(kind=DP) :: fac_n, a, a_n, fr1,fi1,fr2,fi2
        integer       :: id_sname = -1
        call tstatc0_begin('term_related_to_drv_of_flmt_VDB ',id_sname)

        do lmt1 = 1, ilmt(it)
           lmt11 = lmt1 + aoffset
           lmta1 = lmta(lmt1,ia)
           do lmt2 = lmt1, ilmt(it)
              lmt22 = lmt2 + aoffset
              lmta2 = lmta(lmt2,ia)
              fac_n = 2*fac; if(lmt1 == lmt2) fac_n = fac
              if(ipaw(it)==0) then
                  a = vlhxcq(lmt1,lmt2,ia,is)+dion(lmt1,lmt2,it)
              else
                  a = vlhxcq(lmt1,lmt2,ia,is)+dion_paw(lmt1,lmt2,is,ia)
              end if
              if(k_symmetry(ik) == GAMMA) then
!f#ifdef PARA3D
                 do i = 1, np_e_3D                              ! MPI
                    a_n = fac_n*(a - eko_l_3D(i,ik)*q(lmt1,lmt2,it))*occup_l_3D(i,ik)
!                   fr1 = fsr_l_3D(i,lmta1,ik); fr2 = fsr_l_3D(i,lmta2,ik)
                    fr1 = fsr_l_2D(i,lmta1); fr2 = fsr_l_2D(i,lmta2)
!f#else
!f                 do i = 1, np_e                                 ! MPI
!f                    a_n = fac_n*(a - eko_l(i,ik)*q(lmt1,lmt2,it))*occup_l(i,ik)
!f                    fr1 = fsr_l(i,lmta1,ik); fr2 = fsr_l(i,lmta2,ik)
!f#endif
                    fx  = fx - a_n * (gr(i,lmt11,1)*fr2 + fr1*gr(i,lmt22,1))
                    fy  = fy - a_n * (gr(i,lmt11,2)*fr2 + fr1*gr(i,lmt22,2))
                    fz  = fz - a_n * (gr(i,lmt11,3)*fr2 + fr1*gr(i,lmt22,3))
                 end do
              else
!f#ifdef PARA3D
                 do i = 1, np_e_3D                              ! MPI
                    a_n = fac_n*(a - eko_l_3D(i,ik)*q(lmt1,lmt2,it))*occup_l_3D(i,ik)
!                   fr1 = fsr_l_3D(i,lmta1,ik); fr2 = fsr_l_3D(i,lmta2,ik)
                    fr1 = fsr_l_2D(i,lmta1); fr2 = fsr_l_2D(i,lmta2)
!                   fi1 = fsi_l_3D(i,lmta1,ik); fi2 = fsi_l_3D(i,lmta2,ik)
                    fi1 = fsi_l_2D(i,lmta1); fi2 = fsi_l_2D(i,lmta2)
!f#else
!f                 do i = 1, np_e                                 ! MPI
!f                    a_n = fac_n*(a - eko_l(i,ik)*q(lmt1,lmt2,it))*occup_l(i,ik)
!f                    fr1 = fsr_l(i,lmta1,ik); fr2 = fsr_l(i,lmta2,ik)
!f                    fi1 = fsi_l(i,lmta1,ik); fi2 = fsi_l(i,lmta2,ik)
!f#endif
                    fx  = fx - a_n * (gr(i,lmt11,1)*fr2+gi(i,lmt11,1)*fi2&
             +fr1*gr(i,lmt22,1)+fi1*gi(i,lmt22,1))
                    fy  = fy - a_n * (gr(i,lmt11,2)*fr2+gi(i,lmt11,2)*fi2&
             +fr1*gr(i,lmt22,2)+fi1*gi(i,lmt22,2))
                    fz  = fz - a_n * (gr(i,lmt11,3)*fr2+gi(i,lmt11,3)*fi2&
             +fr1*gr(i,lmt22,3)+fi1*gi(i,lmt22,3))
                 end do
              end if
           end do
        end do
        call tstatc0_end(id_sname)
      end subroutine term_related_to_drv_of_flmt_VDB

      subroutine substitute_ngabcred(ik)
        integer, intent(in) :: ik
        integer :: ig, i1
        if(ibl2-ibl1+1 > ibsize) stop ' ibl2-ibl1+1 > ibsize'
           do ig = 1, ibl2-ibl1+1
              i1 = nbase(ig+ibl1-1,ik)
              ngabc_red(ig,1) = ngabc(i1,1)
              ngabc_red(ig,2) = ngabc(i1,2)
              ngabc_red(ig,3) = ngabc(i1,3)
           end do
      end subroutine substitute_ngabcred

      subroutine calc_phase_div()
          integer :: ia, ig
          real(kind=DP) :: fx,fy,fz, ph

          do ia = 1, natm
             fx = pos(ia,1)*PAI2
             fy = pos(ia,2)*PAI2
             fz = pos(ia,3)*PAI2
             do ig = 1, ibl2-ibl1+1
                ph = ngabc_red(ig,1)*fx + ngabc_red(ig,2)*fy + ngabc_red(ig,3)*fz
                zfcos(ig,ia) = dcos(ph)
                zfsin(ig,ia) = dsin(ph)
             end do
          end do
      end subroutine calc_phase_div

      subroutine pre_drv_of_betar_dot_WFs_div()
          integer :: ia, i, t, lmt1, lmtt1, icnt, mil, fac_tmp

          icnt = 0
          if(k_symmetry(ik) == GAMMA) then
             if(kimg == 1) then   ! only ar
               do ia = 1, natm
                  it = ityp(ia)
                  do lmt1 = 1, ilmt(it)
                     icnt = icnt + 1
                     lmtt1 = lmtt(lmt1,it)
                     do i = ibl1, ibl2
                        ar_tmp(i-ibl1+1,icnt,1) = zfcos(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,1)
                        ar_tmp(i-ibl1+1,icnt,2) = zfcos(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,2)
                        ar_tmp(i-ibl1+1,icnt,3) = zfcos(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,3)
                     end do
                  enddo
               enddo
             else if(kimg == 2) then
               do ia = 1, natm
                  it = ityp(ia)
                  do lmt1 = 1, ilmt(it)
                     icnt = icnt + 1
                     lmtt1 = lmtt(lmt1,it)
                     mil = mod( ltp(lmt1,it),4 )
                     if( mil.eq.1 .or. mil.eq.2 ) then
                       fac_tmp= -2.d0
                     else if( mil.eq.0 .or. mil.eq.3 ) then
                       fac_tmp=  2.d0
                     endif

                     if( mil.eq.1 .or. mil.eq.3 ) then
                         do i = ibl1, ibl2
                            ar_tmp(i-ibl1+1,icnt,1) =  fac_tmp*zfcos(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,1)
                            ar_tmp(i-ibl1+1,icnt,2) =  fac_tmp*zfcos(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,2)
                            ar_tmp(i-ibl1+1,icnt,3) =  fac_tmp*zfcos(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,3)
                            ai_tmp(i-ibl1+1,icnt,1) =  fac_tmp*zfsin(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,1)
                            ai_tmp(i-ibl1+1,icnt,2) =  fac_tmp*zfsin(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,2)
                            ai_tmp(i-ibl1+1,icnt,3) =  fac_tmp*zfsin(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,3)
                         end do
                     else if( mil.eq.0 .or. mil.eq.2 ) then
                         do i = ibl1, ibl2
                            ar_tmp(i-ibl1+1,icnt,1) = -fac_tmp*zfsin(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,1)
                            ar_tmp(i-ibl1+1,icnt,2) = -fac_tmp*zfsin(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,2)
                            ar_tmp(i-ibl1+1,icnt,3) = -fac_tmp*zfsin(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,3)
                            ai_tmp(i-ibl1+1,icnt,1) =  fac_tmp*zfcos(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,1)
                            ai_tmp(i-ibl1+1,icnt,2) =  fac_tmp*zfcos(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,2)
                            ai_tmp(i-ibl1+1,icnt,3) =  fac_tmp*zfcos(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,3)
                         end do
                     endif
                  enddo
                enddo
             endif
          else
               do ia = 1, natm
                  it = ityp(ia)
                  do lmt1 = 1, ilmt(it)
                     icnt = icnt + 1
                     lmtt1 = lmtt(lmt1,it)
                     mil_tmp(icnt) = mod( ltp(lmt1,it),4 )
                      do i = ibl1, ibl2
                         ar_tmp(i-ibl1+1,icnt,1) = zfcos(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,1)
                         ar_tmp(i-ibl1+1,icnt,2) = zfcos(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,2)
                         ar_tmp(i-ibl1+1,icnt,3) = zfcos(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,3)
                         ai_tmp(i-ibl1+1,icnt,1) = zfsin(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,1)
                         ai_tmp(i-ibl1+1,icnt,2) = zfsin(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,2)
                         ai_tmp(i-ibl1+1,icnt,3) = zfsin(i-ibl1+1,ia)*snl(i,lmtt1,iksnl)*ngabc_red(i-ibl1+1,3)
                      end do
                  enddo
                enddo
          endif
      end subroutine pre_drv_of_betar_dot_WFs_div

      subroutine drv_of_betar_dot_WFs_div(gr,gi)
        real(kind=DP),intent(inout), dimension(np_e_3D,nlmta,3) :: gr,gi
        integer :: lmt1, ib, i, mil, ibl1_2, nblk, nblk2
        real(kind=DP) :: br, bi, tmp1, tmp2, tmp3
        integer :: id_sname = -1
        call tstatc0_begin('drv_of_betar_dot_WFs_div ',id_sname)

         if(k_symmetry(ik) == GAMMA) then
            if(kimg == 1) then
               nblk=ibl2-ibl1+1
               nblk2=ibl1-ista_g1k_3D(ik)+1
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ar_tmp(1,1,1),ibsize, &
              &           0.d0,gr(1,1,1),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ar_tmp(1,1,2),ibsize, &
              &           0.d0,gr(1,1,2),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ar_tmp(1,1,3),ibsize, &
              &           0.d0,gr(1,1,3),np_e_3D)
            else if(kimg == 2) then
               ibl1_2 = max(2,ibl1)
               nblk =ibl2-ibl1_2+1
               nblk2=ibl1_2-ista_g1k_3D(ik)+1
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ai_tmp(ibl1_2-ibl1+1,1,1), &
              &           ibsize,0.d0,gr(1,1,1),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,2),maxval(np_g1k_3D),ar_tmp(ibl1_2-ibl1+1,1,1), &
              &           ibsize,1.d0,gr(1,1,1),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ai_tmp(ibl1_2-ibl1+1,1,2), &
              &           ibsize,0.d0,gr(1,1,2),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,2),maxval(np_g1k_3D),ar_tmp(ibl1_2-ibl1+1,1,2), &
              &           ibsize,1.d0,gr(1,1,2),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ai_tmp(ibl1_2-ibl1+1,1,3), &
              &           ibsize,0.d0,gr(1,1,3),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,2),maxval(np_g1k_3D),ar_tmp(ibl1_2-ibl1+1,1,3), &
              &           ibsize,1.d0,gr(1,1,3),np_e_3D)
            endif
         else
            if(kimg == 1) then
               nblk =ibl2-ibl1+1
               nblk2=ibl1-ista_g1k_3D(ik)+1
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ar_tmp(1,1,1),ibsize, &
              &           0.d0,gr(1,1,1),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ar_tmp(1,1,2),ibsize, &
              &           0.d0,gr(1,1,2),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ar_tmp(1,1,3),ibsize, &
              &           0.d0,gr(1,1,3),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ai_tmp(1,1,1),ibsize, &
              &           0.d0,gi(1,1,1),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ai_tmp(1,1,2),ibsize, &
              &           0.d0,gi(1,1,2),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ai_tmp(1,1,3),ibsize, &
              &           0.d0,gi(1,1,3),np_e_3D)
            else if(kimg == 2) then
               nblk =ibl2-ibl1+1
               nblk2=ibl1-ista_g1k_3D(ik)+1
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ar_tmp(1,1,1),ibsize, &
              &           0.d0,gr(1,1,1),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk,-1.d0,zaj_l_3D(nblk2,1,ik,2),maxval(np_g1k_3D),ai_tmp(1,1,1),ibsize, &
              &           1.d0,gr(1,1,1),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ar_tmp(1,1,2),ibsize, &
              &           0.d0,gr(1,1,2),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk,-1.d0,zaj_l_3D(nblk2,1,ik,2),maxval(np_g1k_3D),ai_tmp(1,1,2),ibsize, &
              &           1.d0,gr(1,1,2),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ar_tmp(1,1,3),ibsize, &
              &           0.d0,gr(1,1,3),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk,-1.d0,zaj_l_3D(nblk2,1,ik,2),maxval(np_g1k_3D),ai_tmp(1,1,3),ibsize, &
              &           1.d0,gr(1,1,3),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ai_tmp(1,1,1),ibsize, &
              &           0.d0,gi(1,1,1),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,2),maxval(np_g1k_3D),ar_tmp(1,1,1),ibsize, &
              &           1.d0,gi(1,1,1),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ai_tmp(1,1,2),ibsize, &
              &           0.d0,gi(1,1,2),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,2),maxval(np_g1k_3D),ar_tmp(1,1,2),ibsize, &
              &           1.d0,gi(1,1,2),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,1),maxval(np_g1k_3D),ai_tmp(1,1,3),ibsize, &
              &           0.d0,gi(1,1,3),np_e_3D)
               call DGEMM('T','N',np_e_3D,nlmta,nblk, 1.d0,zaj_l_3D(nblk2,1,ik,2),maxval(np_g1k_3D),ar_tmp(1,1,3),ibsize, &
              &           1.d0,gi(1,1,3),np_e_3D)
            end if
            do lmt1 = 1, nlmta
               mil = mil_tmp(lmt1)
               if( mil == 1 ) then
                  do ib = 1, np_e_3D
                     tmp1 = gr(ib,lmt1,1)
                     tmp2 = gr(ib,lmt1,2)
                     tmp3 = gr(ib,lmt1,3)
                     gr(ib,lmt1,1) = -gi(ib,lmt1,1)
                     gr(ib,lmt1,2) = -gi(ib,lmt1,2)
                     gr(ib,lmt1,3) = -gi(ib,lmt1,3)
                     gi(ib,lmt1,1) = tmp1
                     gi(ib,lmt1,2) = tmp2
                     gi(ib,lmt1,3) = tmp3
                  end do
               elseif (mil == 2 ) then
                  do ib = 1, np_e_3D
                     gr(ib,lmt1,1) = - gr(ib,lmt1,1)
                     gr(ib,lmt1,2) = - gr(ib,lmt1,2)
                     gr(ib,lmt1,3) = - gr(ib,lmt1,3)
                     gi(ib,lmt1,1) = - gi(ib,lmt1,1)
                     gi(ib,lmt1,2) = - gi(ib,lmt1,2)
                     gi(ib,lmt1,3) = - gi(ib,lmt1,3)
                  end do
               elseif (mil == 3 ) then
                  do ib = 1, np_e_3D
                     tmp1 = gr(ib,lmt1,1)
                     tmp2 = gr(ib,lmt1,2)
                     tmp3 = gr(ib,lmt1,3)
                     gr(ib,lmt1,1) = gi(ib,lmt1,1)
                     gr(ib,lmt1,2) = gi(ib,lmt1,2)
                     gr(ib,lmt1,3) = gi(ib,lmt1,3)
                     gi(ib,lmt1,1) = -tmp1
                     gi(ib,lmt1,2) = -tmp2
                     gi(ib,lmt1,3) = -tmp3
                  end do
               endif
            enddo
         end if
         call tstatc0_end(id_sname)
      end subroutine drv_of_betar_dot_WFs_div
  end subroutine m_Force_term_drv_of_flmt_3D
! -->
#endif
