#define NEC_TUNE
!=======================================================================
!
!  PROGRAM  PHASE/0 2014.01 (rev.375)
!
!  "First-principles Electronic Structure Calculation Program"
!
!  MODULE: m_NonLocal_Potential
!
!  AUTHOR(S): T. Yamasaki   August/20/2003
!  
!  Contact address :  Phase System Consortium
!                     E-mail: phase_system@nims.go.jp URL https://azuma.nims.go.jp
!  
!
!
!=======================================================================
!
!   Since 2002, this program set had been intensively developed as a part of the following 
!  national projects supported by the Ministry of Education, Culture, Sports, Science and 
!  Technology (MEXT) of Japan; "Frontier Simulation Software for Industrial Science 
!  (FSIS)" from 2002 to 2005, "Revolutionary Simulation Software (RSS21)" from 2006 to 
!  2008. "Research and Development of Innovative Simulation Software (RISS)" from 2008 
!  to 2013. These projects is lead by the Center for Research on Innovative Simulation 
!  Software (CISS), the Institute of Industrial Science (IIS), the University of Tokyo.
!   Since 2013, this program set has been further developed centering on PHASE System 
!  Consortium. 
!   The activity of development of this program set has been supervised by Takahisa Ohno.
!
module m_NonLocal_Potential
!    ( m_NLP )
!  $Id: m_NonLocal_Potential.F90 348 2013-11-14 07:50:23Z tkato $
!
!  This module contains following major subroutines
!  1.  m_NLP_alloc_snl
!  2.  m_NLP_wd_snl(nfcntn_bin,kv3)
!  3.  m_NLP_rd_snl(nfcntn_bin,kv3)
!  4.  m_NLP_betar_dot_PWs(nfout,kv3,vkxyz)
!  5.  m_NLP_betar_dot_PWs_diff(nfout,kv3,vkxyz)
!
! The subroutine "m_NLP_betar_dot_WFs" originates from a subroutine "kbint".
! The subroutine "kbint" was coded mainly by Y. Morikawa in 1993 or earlier.
! The subroutine "kbint" has following comments
!!!$C Fourier transformation of function beta(r).
!!$c  #1) gx,gy,gz --> ngabc by T.Yamasaki on 15th Feb. 1995
!!$c  #2) vx,vy,vz --> vkxyz by T.Yamasaki on 15th Feb. 1995
!!$c  #3) betar --> betar_l  by T.Yamasaki on 3rd Mar. 1995
!!$c  #4) snl2 is introduced by Y. Morikawa. 14th May. 1996
!!$c  #5) antiferromagnetic calculation is added on 9th Jul. 1996
!!$c                                          by H.Sawada
!
! The subroutine "m_NLP_betar_dot_PWs_diff" originates from a subroutine
! "kbint_diff" coded by H. Sawada at 8th May 1997.
! This was translated into the subroutine "m_NLP_betar_dot_PWs_diff" using
! fortran90+MPI by H. Sawada and T. Yamasaki in 1999.
!
  use m_PlaneWaveBasisSet,    only : kgp,kg1,ngabc,iba,nbase
  use m_PseudoPotential,      only : wos,radr,betar,nmesh,mmesh,ilmt,xh,rmax,ntau &
       &                           , m_PP_tell_lmtt_l_m_tau,nlmtt &
       &                           , m_PP_tell_lmtt_l_m_tau_phi &
       &                           , m_PP_tell_lmtt_l_m_tau_add &
       &                           , m_PP_tell_lmtt_l_m_tau_pao &
       &                           , phirt,ilmt_phi, nlmtt_phi &
       &                           , betar_add,ilmt_add, nlmtt_add &
       &                           , paor,ilmt_pao, nlmtt_pao
  use m_Crystal_Structure,    only : rltv, univol, op
  use m_Ionic_System,         only : ntyp
  use m_Timing,               only : tstatc0_begin, tstatc0_end
  use m_Control_Parameters,   only : nspin, ipri, iprisnl, istress &
       &                           , sw_orb_popu, ipriphig, ipripao
  use m_Const_Parameters,     only : DP, PAI2, PAI4, BUCS, CRDTYP, ON, DELTA
  use m_Parallelization,      only : map_k,myrank_k,ista_k,iend_k,mype &
       &                           , myrank_e,nrank_e,mpi_k_world,ierr &
       &                           , ista_snl, iend_snl, mpi_comm_group
  use m_Parallelization,      only : np_g1k,ista_g1k, iend_g1k    &
       &                           , mpi_kg_world,mpi_ke_world 
  use m_PlaneWaveBasisSet,   only : ngabc_kngp_l, ngabc_kngp_B_l,kg

! ================================ added by K. Tagami ============== 11.0
  use m_Control_Parameters,   only : noncol, ndim_spinor
! ================================================================== 11.0

  implicit none
  include 'mpif.h'                              ! MPI
  integer istatus(mpi_status_size)              ! MPI

  real(kind=DP), allocatable,target,dimension(:,:,:)     :: snl  !d(kg1,nlmtt,ista_snl:iend_snl)
  real(kind=DP), allocatable,target,dimension(:,:,:,:,:) :: snld !d(kg1,nlmtt,ista_snl:iend_snl,3)

  real(kind=DP), allocatable,dimension(:,:,:)     :: snl_add  !d(kg1,nlmtt_add,ista_snl:iend_snl)

  real(kind=DP),private,allocatable,dimension(:) :: qx,qy,qz,vlength,snl2,wka,wkb,ylm
  real(kind=DP),private,allocatable,dimension(:)     :: snl3,wkc,wkd
  real(kind=DP),private,allocatable,dimension(:,:)   :: ylmd,alinvt

  real(kind=DP), allocatable,dimension(:,:,:) :: phig !d(kg1,nlmtt_phi,ista_snl:iend_snl)
  real(kind=DP), allocatable,dimension(:,:) :: norm_phig !d(nlmtt_phi,ista_snl:iend_snl)

  real(kind=DP), allocatable,dimension(:,:,:) :: paog !d(kg1,nlmtt_pao,ista_snl:iend_snl)
contains
  subroutine m_NLP_alloc_snl
!!$    ista_snl = (ista_k + nspin - 1)/nspin
!!$    iend_snl = iend_k/nspin
!!$    print '(" ista_snl = ",i3)',ista_snl
!!$    print '(" iend_snl = ",i3)',iend_snl
    allocate(snl(maxval(np_g1k),nlmtt,ista_snl:iend_snl)); snl = 0.d0
    if(istress == ON) then

       allocate(snld(maxval(np_g1k),nlmtt,ista_snl:iend_snl,3,3)); snld = 0.d0
    end if
  end subroutine m_NLP_alloc_snl

  subroutine m_NLP_alloc_phig
    if(sw_orb_popu == ON) then
       allocate(phig(maxval(np_g1k),nlmtt_phi,ista_snl:iend_snl)); phig = 0.d0
       allocate(norm_phig(nlmtt_phi,ista_snl:iend_snl)); norm_phig = 1.d0
    end if
  end subroutine m_NLP_alloc_phig
 
  subroutine m_NLP_alloc_snl_add

    allocate(snl_add(maxval(np_g1k),nlmtt_add,ista_snl:iend_snl)); snl_add = 0.d0
  end subroutine m_NLP_alloc_snl_add

  subroutine m_NLP_alloc_paog
    allocate(paog(maxval(np_g1k),nlmtt_pao,ista_snl:iend_snl)); paog = 0.d0
  end subroutine m_NLP_alloc_paog




    subroutine wd_lmt_l_m_tau_etc(nfout,it,lmt1,il1,im1,tau1,lmtt1,nspher)
    integer, intent(in)  :: nfout,it,lmt1,il1,im1,tau1,lmtt1,nspher
      write(nfout,230) it,lmt1,il1,im1,tau1,lmtt1,nspher
230   format(' ',' it=',i2,' lmt1=',i2,' il1,im1,tau1=',3i2, &
           &                   ' lmtt1=',i3,' nspher=',i3)
    end subroutine wd_lmt_l_m_tau_etc

    subroutine new_radr_and_wos(ik,it)
      integer, intent(in)  :: ik,it
      real(kind=DP), parameter :: delta = 1.d-40
      real(kind=DP)      :: hn
      real(kind=DP),save :: xhn = 0.d0 , rmaxn = 0.d0
      integer,save       :: nmeshn = 0
      if((ik == ista_k .and. it == 1) .or. &            ! MPI
           & (nmeshn /= nmesh(it) .or. dabs(xhn-xh(it)) > delta  .or. &
           &  dabs(rmaxn-rmax(it)) > delta) ) then
         call rmeshs(nmesh(it),nmesh(it),xh(it),rmax(it),radr,hn) ! -(b_PP)
         call coef_simpson_integration(nmesh(it),nmesh(it),xh(it),radr,wos) ! -(b_PP)
         xhn = xh(it); rmaxn = rmax(it); nmeshn = nmesh(it)
      endif
    end subroutine new_radr_and_wos

    subroutine radr_and_wos_p(n1,n2,mtyp,mradr,ip_radr,radr_p,wos_p)
      integer, intent(in) :: n1, n2, mtyp,mradr
      integer, intent(in), dimension(mtyp) :: ip_radr
      real(kind=DP), intent(out), dimension(n1:n2,mradr) :: radr_p, wos_p

      real(kind=DP)      :: hn
      integer :: it, i, ip, ip0
      ip = 0
      do it = 1, ntyp
         if(ntyp > mtyp) cycle
         if(ip_radr(it) /= ip) then
            call rmeshs(nmesh(it),nmesh(it),xh(it),rmax(it),radr,hn) ! -(b_PP)
            call coef_simpson_integration(nmesh(it),nmesh(it),xh(it),radr,wos) ! -(b_PP)
            ip = ip_radr(it)
            do i = n1, n2
               radr_p(i,ip) = radr(i)
               wos_p(i,ip) = wos(i)
            end do
         end if
      end do
    end subroutine radr_and_wos_p

    subroutine find_critical_point(r,n,x,idp)
      real(kind=DP), intent(in) :: r
      integer,  intent(in)      :: n
      real(kind=DP), intent(in), dimension(n) :: x
      integer,  intent(out)     :: idp
      integer :: i
      do i = 1, n
         idp = i
         if(x(i) > r) exit
      end do
    end subroutine find_critical_point


  subroutine m_NLP_dealloc
    if(allocated(snl)) deallocate(snl)
    if(allocated(snl_add)) deallocate(snl_add)          

    if(allocated(paog)) deallocate(paog)          
    if(istress == ON) deallocate(snld)
  end subroutine m_NLP_dealloc

!===============================================================================
  subroutine m_NLP_betar_dot_PWs_3D(nfout,kv3,vkxyz)
    integer,       intent(in)       :: nfout,kv3
    real(kind=DP), intent(in)       :: vkxyz(kv3,3,CRDTYP)

    real(kind=DP)       :: fac, facr
    integer             :: ik,iksnl,it,n,lmt1,lmtt1,il1,im1,tau1,nspher, ig, n1, n2
    integer             :: i, j, k
    integer             :: id_sname = -1
    real(kind=DP), dimension(:,:), allocatable :: snl2_mpi, snl1

#ifndef _mNLP_no_loop_exchange_
    integer, parameter  :: lcmax = 4
    integer :: ip
    integer :: mil, mp  ! mp = maxval(np)
    integer, dimension(:),     allocatable :: nil  ! d(ntyp)
    integer, dimension(:,:),   allocatable :: nt   ! d(ntyp,mil)
    integer, dimension(:,:,:), allocatable :: tu2p ! d(ntau,ntyp,mil)
    integer, dimension(:),     allocatable :: np   ! d(mil), np=sum_{u=1}^{ntyp}(nt(u,mil))
    integer, dimension(:,:,:), allocatable ::  pm2lmtt1 ! d(mp,2*(mil-1)+1,mil)
    real(kind=DP), dimension(:,:), allocatable :: snl_t, snl_t_mpi ! d(iba(ik),mp)
    real(kind=DP), dimension(:,:), allocatable :: radr_p, wos_p ! d(n1:n2,ntyp)
    integer :: p, mradr_indp 
    logical, dimension(:,:), allocatable :: flag_radr !d(ntyp,mil)
    integer, dimension(:), allocatable   :: ip_radr !d(ntyp)
    real(kind=DP) :: r, w
#endif

#ifdef __TIMER_SUB__
  call timer_sta(1224)
#endif

    call tstatc0_begin('m_NLP_betar_dot_PWs ',id_sname,1)
    call innerPr_allocate_3D()

    fac = PAI4/dsqrt(univol)

#ifndef _mNLP_no_loop_exchange_
!!$    allocate(nil(ntyp)); nil = 0
!!$    allocate(nt(ntyp,lcmax)); allocate(tu2p(ntau,ntyp,lcmax)); allocate(np(lcmax))
!!$    nt = 0; tu2p = 0
!!$    call set_index_arrays1(ntyp,nil,nt,tu2p,mil,mp) ! mil, mp, nil, nt, tu2p, np, pm2lmtt1, contained here
!!$    allocate(pm2lmtt1(mp,2*(mil-1)+1,mil))
!!$    call set_index_arrays2(mp,pm2lmtt1)
    call set_index_arrays() ! mil, mp, nil, nt, tu2p, np, pm2lmtt1, contained here

    n = ceiling(dble(mmesh)/nrank_e)
    if(n == 0) n = 1
    n1 = n*myrank_e + 1
    n2 = n*(myrank_e+1)
    if(n2 > mmesh) n2 = mmesh

    allocate(flag_radr(ntyp,mil))
    allocate(ip_radr(ntyp))
!!$ASASASASAS
    flag_radr = .false.
    ip_radr = 0
!!$ASASASASAS
    call set_flag_radr_and_ip_radr() ! -> mradr_indp,flag_radr, ip_radr

    if(n1 <= n2) then
       allocate(radr_p(n1:n2,mradr_indp))
       allocate(wos_p(n1:n2,mradr_indp))
!!$ASASASASAS
       radr_p = 0; wos_p = 0
!!$ASASASASAS
       call radr_and_wos_p(n1,n2,ntyp,mradr_indp,ip_radr,radr_p,wos_p) ! -> radr_p, wos_p
    end if

    do ik = 1, kv3, nspin
       if(ipri >= 2) write(nfout,'(" ik = ",i8)') ik
       if(map_k(ik) /= myrank_k) cycle                     ! MPI
!!$ASASASASAS
!!$       allocate(snl_t(iba(ik),mp))
!!$       if(nrank_e > 1) allocate(snl_t_mpi(iba(ik),mp))
       allocate(snl_t(np_g1k(ik),mp)); snl_t = 0
       if(nrank_e > 1) then
          allocate(snl_t_mpi(np_g1k(ik),mp)); snl_t_mpi = 0
       endif
!!$ASASASASAS

       if(ipri >= 2) write(nfout,'(" entering k_plus_G_vectors")')
       if(ipri >= 2) then
          write(nfout,'(" iba(ik) = ",i8)') iba(ik)
          write(nfout,'(" vkxyz(ik) = ",3f8.4)') vkxyz(ik,1:3,BUCS)
       end if
       if(.not.allocated(qx)) stop ' qx is not allocated'
       if(.not.allocated(vlength)) stop ' vlength is not allocated'
       if(.not.allocated(nbase)) stop ' nbase is not allocated'
!       call k_plus_G_vectors_3D(ik,kgp,kg1,kv3,iba,nbase,vkxyz,ngabc,rltv&
!            &,qx,qy,qz,vlength)                            ! ->(bottom_Subr.)
       call k_plus_G_vectors_3D(ik,kg,kg1,kv3,iba,nbase,vkxyz,ngabc,rltv&
            &,qx,qy,qz,vlength)                            ! ->(bottom_Subr.)
       iksnl = (ik-1)/nspin + 1

       do il1 = 1, mil
          snl_t = 0.d0
          do n = n1, n2
             do it = 1, ntyp
                if(il1 > nil(it)) cycle
                if(flag_radr(it,il1)) then
                   r = radr_p(n,ip_radr(it))
                   do ig = 1, np_g1k(ik)
                      wka(ig) = vlength(ig)*r
                   end do
                   call dsjnv(il1-1,np_g1k(ik),wka,wkb)     ! -(bottom_Subr.)
                end if

                w = fac * wos_p(n,ip_radr(it)) * radr_p(n,ip_radr(it))
                do tau1 = 1, nt(it,il1)
                   facr = w * betar(n,il1,tau1,it)
                   ip = tu2p(tau1,it,il1)
                   do ig = 1, np_g1k(ik)
                      snl_t(ig,ip) = snl_t(ig,ip) + facr*wkb(ig)
                   end do
                end do
             end do
          end do
          if(nrank_e > 1) then
             call mpi_allreduce(snl_t,snl_t_mpi,np_g1k(ik)*np(il1),mpi_double_precision,mpi_sum,mpi_kg_world,ierr)
             snl_t = snl_t_mpi
          end if

          do im1 = 1, 2*(il1-1)+1
             nspher = (il1-1)**2 + im1
             call sphr(np_g1k(ik),nspher,qx,qy,qz,ylm)        ! -(bottom_Subr.)
             do p = 1, np(il1)
                ip = pm2lmtt1(p,im1,il1)
                do ig = 1, np_g1k(ik)
                   snl(ig,ip,iksnl) = snl_t(ig,p)*ylm(ig)
                end do
             end do
          end do
       end do
       if(nrank_e > 1) deallocate(snl_t_mpi)
       deallocate(snl_t)
    end do

    if(n1 <= n2) deallocate(wos_p,radr_p)
    deallocate(ip_radr,flag_radr)
    call dealloc_index_arrays()

#else
    if(nrank_e > 1) then
       allocate(snl1(kg1,nlmtt)); snl1 = 0.d0
       allocate(snl2_mpi(kg1,nlmtt))
!!$ASASASASAS
       snl2_mpi = 0.d0
!!$ASASASASAS
    end if

    do ik = 1, kv3, nspin
       if(map_k(ik) /= myrank_k) cycle                     ! MPI

       call k_plus_G_vectors(ik,kgp,kg1,kv3,iba,nbase,vkxyz,ngabc,rltv&
            &,qx,qy,qz,vlength)                            ! ->(bottom_Subr.)
       iksnl = (ik-1)/nspin + 1

       do it=1,ntyp

          n = ceiling(dble(nmesh(it))/nrank_e)
          if(n == 0) n = 1
          n1 = n*myrank_e + 1
          n2 = n*(myrank_e+1)
          if(n2 > nmesh(it)) n2 = nmesh(it)

          call new_radr_and_wos(ik,it)                 ! --> radr, wos
          do lmt1 = 1,ilmt(it)
             call m_PP_tell_lmtt_l_m_tau(lmt1,it,lmtt1,il1,im1,tau1,nspher)
             call sphr(iba(ik),nspher,qx,qy,qz,ylm)        ! -(bottom_Subr.)
             if(iprisnl >= 2) call wd_lmt_l_m_tau_etc &
                                (nfout,it,lmt1,il1,im1,tau1,lmtt1,nspher)
             snl2 = 0.d0

             do n = n1, n2
                facr = fac*wos(n)*radr(n)*betar(n,il1,tau1,it)
                do ig = 1, iba(ik)
                   wka(ig) = vlength(ig)*radr(n)
                end do
                call dsjnv(il1-1,iba(ik),wka,wkb)     ! -(bottom_Subr.)
                do ig = 1, iba(ik)
                   snl2(ig) = snl2(ig) + facr*wkb(ig)
                end do
             end do

             if(nrank_e == 1) then
                do ig = 1, iba(ik)
                   snl(ig,lmtt1,iksnl) = snl2(ig)*ylm(ig)
                end do
             else
                do ig = 1, iba(ik)
                   snl1(ig,lmtt1) = snl2(ig)*ylm(ig)
                end do
             end if
          end do
       end do

       if(nrank_e > 1) then
          call mpi_allreduce(snl1, snl2_mpi, kg1*nlmtt,mpi_double_precision,mpi_sum,mpi_k_world(myrank_k),ierr)
          do lmtt1 = 1, nlmtt
             do ig = 1, iba(ik)
                snl(ig,lmtt1,iksnl) = snl2_mpi(ig,lmtt1)
             end do
          end do
       end if
    end do
    if(nrank_e > 1) deallocate(snl2_mpi,snl1)
#endif
    if(iprisnl >= 2) call wd_snl

    call innerPr_deallocate_3D
    call tstatc0_end(id_sname)
#ifdef __TIMER_SUB__
  call timer_end(1224)
#endif

  contains
#ifndef _mNLP_no_loop_exchange_
    subroutine set_flag_radr_and_ip_radr()
      integer :: it, il, ip, it2, is, it1

      mradr_indp = 1
      ip_radr(1) = 1
      do it = 2, ntyp
         is = 0
         search_loop: do it2 = 1, mradr_indp
            ip = ip_radr(it2)
            if((nmesh(ip) == nmesh(it) .and. dabs(xh(ip)-xh(it)) < DELTA .and. &
              & dabs(rmax(ip)-rmax(it)) < DELTA)) then
               is = it2
               exit search_loop
            end if
         end do search_loop
         if(is <= 0 .or. is > mradr_indp) then
            mradr_indp = mradr_indp+1
            is = mradr_indp
         end if
         ip_radr(it) = is
      end do

      if(ipri >= 2) then
         write(nfout,'(" !mNLP : mradr_indp = ", i5)') mradr_indp
         do it = 1, ntyp
            write(nfout,'(" !mNLP : ip_radr(",i5,") = ",i5)') it, ip_radr(it)
         end do
      end if

      do il = 1, mil
         flag_radr(1:ntyp,il) = .true.
         it1 = 0
         do it = 1, ntyp
            if(il > nil(it)) cycle
            it1 = it1+1
            if(it1 == 1) then
               flag_radr(it,il) = .true.
            else if(it1 >= 2) then
               if(ip_radr(ip) /= ip_radr(it)) then
                  flag_radr(it,il) = .true.
               else
                  flag_radr(it,il) = .false.
               end if
            end if
            ip = it
         end do
      end do

      if(ipri >= 2) then
         do il = 1, mil
            do it = 1, ntyp
               write(nfout,'(" !mNLP : flag_radr(",i5,",",i5,") = ",l3)') it,il,flag_radr(it,il)
            end do
         end do
      end if
    end subroutine set_flag_radr_and_ip_radr

    subroutine set_index_arrays()
      integer :: it, lmt1, n, il1, im1

      allocate(nil(ntyp)); nil = 0
      allocate(nt(ntyp,lcmax)); allocate(tu2p(ntau,ntyp,lcmax)); allocate(np(lcmax))
      nt = 0; tu2p = 0
      mil = 0
      if(ipri>=2) write(nfout,'(" !mNLP:    it, lmt1: lmtt1, il1, im1, tau1, nspher")')
      do it=1,ntyp
         do lmt1 = 1, ilmt(it)
            call m_PP_tell_lmtt_l_m_tau(lmt1,it,lmtt1,il1,im1,tau1,nspher)
            if(ipri>=2) write(nfout,'(" !mNLP: ",2i5," : ",5i5)') it,lmt1, lmtt1,il1,im1,tau1,nspher
            if(mil < il1) mil = il1
            if(nil(it) < il1) nil(it) = il1
            if(nt(it,il1) < tau1) nt(it,il1) = tau1
         end do
      end do

      do il1=1,mil
         n = 0
         np(il1) = 0
         do it=1,ntyp
            if(il1 > nil(it)) cycle
            np(il1) = np(il1)+nt(it,il1)
            do tau1=1,nt(it,il1)
               n = n+1
               tu2p(tau1,it,il1)= n
            end do
         end do
      end do

      mp = maxval(np(1:mil))
      allocate(pm2lmtt1(mp,2*(mil-1)+1,mil))
!!$ASASASASAS
      pm2lmtt1 = 0
!!$ASASASASAS
      do it = 1, ntyp
         do lmt1=1,ilmt(it)
            call m_PP_tell_lmtt_l_m_tau(lmt1,it,lmtt1,il1,im1,tau1,nspher)
            n = tu2p(tau1,it,il1)
            pm2lmtt1(n,im1,il1) = lmtt1
         end do
      end do

      if(ipri>=2) then
         write(nfout,'(" !mNLP: mil = ",i5)') mil
         do it = 1, ntyp
            write(nfout,'(" !mNLP: nil(",i3,") = ",i5)') it,nil(it)
         end do
         do it = 1, ntyp
            write(nfout,'(" !mNLP: nt(",i3,", : ) = ",5i5)') it,(nt(it,il1),il1=1,nil(it))
         end do

         write(nfout,'(" !mNLP: mp = ",i5)') mp
         do il1 = 1, mil
            write(nfout,'(" !mNLP: np(",i3,")=",i5)') il1, np(il1)
         end do
         do il1 = 1, mil
            do it = 1, ntyp
               if(il1 > nil(it)) cycle
               do tau1 = 1, nt(it,il1)
                  write(nfout,'(" !mNLP: tu2p(",i3,",",i3,",",i3,") = ",i5)') tau1,it,il1,tu2p(tau1,it,il1)
               end do
            end do
         end do
         do il1 = 1, mil
            do im1 = 1, 2*(il1-1)+1
               do n = 1, np(il1)
                  write(nfout,'(" !mNLP: il1 = ",i3," im1 = ",i3," n = ", i3, " pm2lmtt1 = ",i5)') &
                       & il1,im1, n,pm2lmtt1(n,im1,il1)
               end do
            end do
         end do
      end if
    end subroutine set_index_arrays

    subroutine dealloc_index_arrays()
      deallocate(pm2lmtt1,np,tu2p,nt,nil)
    end subroutine dealloc_index_arrays
#endif
      
    subroutine wd_snl
      integer, parameter :: MSNLSIZE = 20
      integer :: i, ilmtt, ik, j, iksnl, icycle, icolumn, max_elements, istart, iend, ic
      write(nfout,'(" << m_NLP_betar_dot_PWs.wd_snl >>")')
      write(nfout,'(10("(",3i2,")"))') ((ngabc(i,j),j=1,3),i=1,30)
      do ik = 1, kv3, nspin
         if(map_k(ik) /= myrank_k) cycle        ! MPI
         iksnl = (ik-1)/nspin + 1
         write(nfout,'(" ik = ",i5)') iksnl
         write(nfout,'(" nbase(1:8,",i5,")",i8,9i12)') ik,(nbase(i,ik),i=1,8)
         if(iprisnl >= 3) then
            max_elements = iba(ik)
         else
            max_elements = min(MSNLSIZE,iba(ik))
         end if
         icolumn = 10
         icycle = ceiling(dble(min(max_elements,kg1))/icolumn)
         do ilmtt = 1, nlmtt
!!$            write(nfout,'("(ilmtt = ",i5,")",8f10.5,99(/15x,8f10.5))') ilmtt,(snl(i,ilmtt,iksnl),i=1,kg1)
            write(nfout,'(" !nlp (ilmtt = ",i5,")")') ilmtt
            istart = 1
            do ic = 1, icycle
               iend = min(istart+icolumn-1,max_elements,kg1)
               write(nfout,'(" !nlp (nx)    ",10i12)') (ngabc(nbase(i,ik),1),i=istart,iend)
               write(nfout,'(" !nlp (ny)    ",10i12)') (ngabc(nbase(i,ik),2),i=istart,iend)
               write(nfout,'(" !nlp (nz)    ",10i12)') (ngabc(nbase(i,ik),3),i=istart,iend)
               write(nfout,'(" !nlp (snl)   ",10d12.4)') (snl(i,ilmtt,iksnl),i=istart,iend)
               istart = iend + 1
            end do
         end do
      end do
    end subroutine wd_snl
  end subroutine m_NLP_betar_dot_PWs_3D

  subroutine innerPr_allocate_3D
    allocate(qx(maxval(np_g1k))); qx = 0.d0
    allocate(qy(maxval(np_g1k))); qy = 0.d0
    allocate(qz(maxval(np_g1k))); qz = 0.d0
    allocate(vlength(maxval(np_g1k))); vlength = 0.d0
    allocate(snl2(maxval(np_g1k))); snl2 = 0.d0
    allocate(wka(maxval(np_g1k)));  wka  = 0.d0
    allocate(wkb(maxval(np_g1k)));  wkb  = 0.d0
    allocate(ylm(maxval(np_g1k)));  ylm  = 0.d0
  end subroutine innerPr_allocate_3D

  subroutine innerPr_deallocate_3D
    deallocate(ylm)
    deallocate(wkb)
    deallocate(wka)
    deallocate(snl2)
    deallocate(vlength)
    deallocate(qz)
    deallocate(qy)
    deallocate(qx)
  end subroutine innerPr_deallocate_3D

!------
  subroutine innerPr1_allocate_3D
    allocate(qx(maxval(np_g1k))); qx = 0.d0
    allocate(qy(maxval(np_g1k))); qy = 0.d0
    allocate(qz(maxval(np_g1k))); qz = 0.d0
    allocate(vlength(maxval(np_g1k))); vlength = 0.d0
    allocate(snl2(maxval(np_g1k))); snl2 = 0.d0
    allocate(wka(maxval(np_g1k)));  wka  = 0.d0
    allocate(wkb(maxval(np_g1k)));  wkb  = 0.d0
    allocate(wkc(maxval(np_g1k)));  wkc  = 0.d0
    allocate(wkd(maxval(np_g1k)));  wkd  = 0.d0
    allocate(ylm(maxval(np_g1k)));  ylm  = 0.d0
    allocate(snl3(maxval(np_g1k))); snl3 = 0.d0
    allocate(ylmd(maxval(np_g1k),3)); ylmd = 0.d0
    allocate(alinvt(3,3)); alinvt = 0.d0
  end subroutine innerPr1_allocate_3D

  subroutine innerPr1_deallocate_3D
    deallocate(alinvt)
    deallocate(ylmd)
    deallocate(snl3)
    deallocate(ylm)
    deallocate(wkd)
    deallocate(wkc)
    deallocate(wkb)
    deallocate(wka)
    deallocate(snl2)
!!$    deallocate(wos)
!!$    deallocate(radr)
    deallocate(vlength)
    deallocate(qz)
    deallocate(qy)
    deallocate(qx)
  end subroutine innerPr1_deallocate_3D

  subroutine m_NLP_betar_dot_PWs_diff_3D(nfout,kv3,vkxyz)

    integer,       intent(in)       :: nfout,kv3
    real(kind=DP), intent(in)       :: vkxyz(kv3,3,CRDTYP)

    real(kind=DP)       :: fac, facr
    integer             :: ik,iksnl,it,n,lmt1,lmtt1,il1,im1,tau1,nspher,j, ig
    integer             :: id_sname = -1

    call tstatc0_begin('m_NLP_betar_dot_PWs_diff ',id_sname,1)
    if(iprisnl >= 2) write(nfout,*) ' <<< m_NLP_betar_dot_PWs_diff >>>'
    call innerPr1_allocate_3D
    if(iprisnl >= 2) write(nfout,*) ' after innerPr1_allocate'

    alinvt = rltv / PAI2
    fac = PAI4/dsqrt(univol)
    do ik = 1, kv3, nspin
       if(map_k(ik) /= myrank_k) cycle             ! MPI

!       call k_plus_G_vectors_3D(ik,kgp,kg1,kv3,iba,nbase,vkxyz,ngabc,rltv&
!            &,qx,qy,qz,vlength)                            ! ->(bottom_Subr.)
       call k_plus_G_vectors_3D(ik,kg,kg1,kv3,iba,nbase,vkxyz,ngabc,rltv&
            &,qx,qy,qz,vlength)                            ! ->(bottom_Subr.)

       iksnl = (ik-1)/nspin + 1
       do it=1,ntyp
          call new_radr_and_wos(ik,it)                 ! --> radr, wos
          do lmt1 = 1,ilmt(it)
             call m_PP_tell_lmtt_l_m_tau(lmt1,it,lmtt1,il1,im1,tau1,nspher)
             ylm = 0.d0; ylmd = 0.d0
             call sphr(np_g1k(ik),nspher,qx,qy,qz,ylm)        ! -(bottom_Subr.)
             call sphr_diff(maxval(np_g1k),np_g1k(ik),nspher,qx,qy,qz,ylmd) ! -(bottom_Subr.)
             if(ipri >= 2) call wd_lmt_l_m_tau_etc &
                                (nfout,it,lmt1,il1,im1,tau1,lmtt1,nspher)
             snl2 = 0.d0; snl3 = 0.d0
             wkb = 0.d0; wkc = 0.d0; wkd = 0.d0
!xocl spread do/ind_kmesh
             do n = 1,nmesh(it)
                facr = fac*wos(n)*radr(n)*betar(n,il1,tau1,it)
                wka = vlength*radr(n)
                if(il1-1.ne.0) call dsjnv(il1-2,np_g1k(ik),wka,wkc)
                !                                            -(bottom_Subr.)
                call dsjnv(il1-1,np_g1k(ik),wka,wkb)     ! -(bottom_Subr.)
                call dsjnv(il1  ,np_g1k(ik),wka,wkd)     ! -(bottom_Subr.)
                do ig = 1, np_g1k(ik)
                   snl2(ig) = snl2(ig) + facr*wkb(ig)
                end do
!!$                snl2 = snl2 + facr*wkb
                if(il1-1 == 0) then
                   do ig = 1, np_g1k(ik)
                      snl3(ig) = snl3(ig) - facr*radr(n)/(2.d0*il1-1.d0)*il1*wkd(ig)
                   end do
                else
                   do ig =1, np_g1k(ik)
                      snl3(ig) = snl3(ig) + &
                           &  facr*radr(n)/(2.d0*il1-1.d0)*((il1-1)*wkc(ig)-il1*wkd(ig))
                   end do
                endif
             end do
!xocl end spread sum(snl2,snl3)
             snl(1:np_g1k(ik),lmtt1,iksnl) = snl2(1:np_g1k(ik)) *  &
    &                               ylm(1:np_g1k(ik))
             where( vlength > 1.d-15 )
                snl3 = - snl3 * ylm / vlength
             elsewhere
                snl3 = 0.d0
             end where

             do j = 1,3
                do ig = 1, np_g1k(ik)
                   snld(ig,lmtt1,iksnl,1,j) = &
                        & snl3(ig)*qx(ig)*(qx(ig)*alinvt(1,j) &
                        &                      +qy(ig)*alinvt(2,j) &
                        &                      +qz(ig)*alinvt(3,j)) &
                        & -snl2(ig)*qx(ig)*(ylmd(ig,1)*alinvt(1,j) &
                        &                       +ylmd(ig,2)*alinvt(2,j) &
                        &                       +ylmd(ig,3)*alinvt(3,j)) &
                        & -snl2(ig)*ylm(ig)/2.d0*alinvt(1,j)
                   snld(ig,lmtt1,iksnl,2,j) = &
                        & snl3(ig)*qy(ig)*(qx(ig)*alinvt(1,j) &
                        &                      +qy(ig)*alinvt(2,j) &
                        &                      +qz(ig)*alinvt(3,j)) &
                        & -snl2(ig)*qy(ig)*(ylmd(ig,1)*alinvt(1,j) &
                        &                       +ylmd(ig,2)*alinvt(2,j) &
                        &                       +ylmd(ig,3)*alinvt(3,j)) &
                        & -snl2(ig)*ylm(ig)/2.d0*alinvt(2,j)
                   snld(ig,lmtt1,iksnl,3,j) = &
                        & snl3(ig)*qz(ig)*(qx(ig)*alinvt(1,j) &
                        &                      +qy(ig)*alinvt(2,j) &
                        &                      +qz(ig)*alinvt(3,j)) &
                        & -snl2(ig)*qz(ig)*(ylmd(ig,1)*alinvt(1,j) &
                        &                       +ylmd(ig,2)*alinvt(2,j) &
                        &                       +ylmd(ig,3)*alinvt(3,j)) &
                        & -snl2(ig)*ylm(ig)/2.d0*alinvt(3,j)
                end do
             end do
          end do
       end do
    end do
    if(ipri >= 2) call wd_snld
    call innerPr1_deallocate_3D
    call tstatc0_end(id_sname)

  contains
    subroutine wd_snld
      integer :: i, ilmtt, ik, j, iksnl, k
      write(nfout,'(" << m_NLP_betar_dot_PWs_diff.wd_snl >>")')
      write(nfout,'(10(''('',3i2,'')''))') ((ngabc(i,j),j=1,3),i=1,30)
      do ik = 1, kv3, nspin
         if(map_k(ik) /= myrank_k) cycle   ! MPI
         iksnl = (ik-1)/nspin + 1
         write(nfout,'(" ik = ",i5)') iksnl
         write(nfout,'(8i3)') (nbase(i,ik),i=1,8)
         do ilmtt = 1, nlmtt
	    write(nfout,'('' ilmtt ='',i3)') ilmtt
            write(nfout,'(8f10.5)') (snl(i,ilmtt,iksnl),i=1,8)
              do i = 1,3
              do j = 1,3
              write(nfout,'(8f10.5)') (snld(k,ilmtt,IKSNL,i,j),k=1,8)
              end do
              end do
         end do
      end do
    end subroutine wd_snld
  end subroutine m_NLP_betar_dot_PWs_diff_3D

  subroutine m_NLP_phir_dot_PWs_3D(nfout,kv3,vkxyz)

    integer,       intent(in)       :: nfout,kv3
    real(kind=DP), intent(in)       :: vkxyz(kv3,3,CRDTYP)

    real(kind=DP)       :: fac, facr
    integer             :: ik,ikphig,it,n,lmt1,lmtt1,il1,im1,tau1,nspher
    integer             :: iopr
    integer             :: id_sname = -1

    call tstatc0_begin('m_NLP_phir_dot_PWs ',id_sname,1)
    if(ipriphig >= 2) &
         & write(nfout,*) ' <<< m_NLP_phir_dot_PWs >>> START'
    call innerPr_allocate_3D()

    fac = PAI4/dsqrt(univol)
    do ik = 1, kv3, nspin
       if(map_k(ik) /= myrank_k) cycle                     ! MPI
!!$       write(nfout,*) ' ik = ', ik
!!$       write(nfout,*) ' kgp,kg1,kv3 = ',kgp,kg1,kv3
!!$       write(nfout,*) ' allocated(iba)   = ',allocated(iba)
!!$       write(nfout,*) ' allocated(nbase) = ',allocated(nbase)
!!$       write(nfout,*) ' allocated(ngabc) = ',allocated(ngabc)
!!$       write(nfout,*) ' allocated(qx)    = ',allocated(qx)
!!$       write(nfout,*) ' allocated(vlength)=',allocated(vlength)

!       call k_plus_G_vectors_3D(ik,kgp,kg1,kv3,iba,nbase,vkxyz,ngabc,rltv&
!               &,qx,qy,qz,vlength)                            ! ->(bottom_Subr.)
       call k_plus_G_vectors_3D(ik,kg,kg1,kv3,iba,nbase,vkxyz,ngabc,rltv&
               &,qx,qy,qz,vlength)                            ! ->(bottom_Subr.)
       ikphig = (ik-1)/nspin + 1
       do it=1,ntyp
          call new_radr_and_wos(ik,it)                 ! --> radr, wos
          do lmt1 = 1,ilmt_phi(it)
             call m_PP_tell_lmtt_l_m_tau_phi(lmt1,it,lmtt1,il1,im1,tau1,nspher)
             call sphr(np_g1k(ik),nspher,qx,qy,qz,ylm)        ! -(bottom_Subr.)
             if(ipri >= 2) call wd_lmt_l_m_tau_etc &
                                (nfout,it,lmt1,il1,im1,tau1,lmtt1,nspher)
             snl2 = 0.d0

             if(.not.allocated(wos)) then
                write(nfout,'(" wos is not allocated")')
                stop ' wos is not allocated'
             end if
             if(.not.allocated(radr)) then
                write(nfout,'(" radr is not allocated")')
                stop ' wos is not allocated'
             end if
             if(ipri>=2) then
                write(nfout,'(" nmesh(it) = ",i8)') nmesh(it)
             end if
             if(.not.allocated(wka)) stop ' wka is not allocated'
             if(.not.allocated(wkb)) stop ' wkb is not allocated'
             if(.not.allocated(phirt)) stop ' phirt is not allocated'
             do n = 1,nmesh(it)
                facr = fac*wos(n)*radr(n)*phirt(n,il1,tau1,it)
                wka = vlength*radr(n)
!!$                call find_critical_point(1.d0,iba(ik),wka,idp)
!!$                call dsjnvn(il1-1,iba(ik),wka,idp,wkb)     ! -(bottom_Subr.)
                call dsjnv(il1-1,np_g1k(ik),wka,wkb)     ! -(bottom_Subr.)
                snl2 = snl2 + facr*wkb*ylm
             end do
             norm_phig(lmtt1,ikphig) = sum(snl2(1:np_g1k(ik))*snl2(1:np_g1k(ik)))
             phig(1:np_g1k(ik),lmtt1,ikphig) = snl2(1:np_g1k(ik))/sqrt(norm_phig(lmtt1,ikphig))
          end do
       end do
    end do

    if(ipriphig >= 2) call wd_phig

    call innerPr_deallocate_3D
    call tstatc0_end(id_sname)

  contains
    subroutine wd_phig
      integer :: i, ilmtt, ik, j, ikphig, iopr
      write(nfout,'(" << m_NLP_phir_dot_PWs.wd_phig >>")')
      write(nfout,'(" nlmtt_phi=",i3)') nlmtt_phi
      write(nfout,'(10("(",3i2,")"))') ((ngabc(i,j),j=1,3),i=1,30)
      do ik = 1, kv3, nspin
         if(map_k(ik) /= myrank_k) cycle        ! MPI
         ikphig = (ik-1)/nspin + 1
         write(nfout,'(" ik = ",i5)') ikphig
         write(nfout,'(" nbase(1:8,",i5,")",8i10)') ik,(nbase(i,ik),i=1,8)
         do ilmtt = 1, nlmtt_phi
            write(nfout,'("(ilmtt = ",i5,")",8f10.5,99(/15x,8f10.5))') ilmtt,(phig(i,ilmtt,ikphig),i=1,kg1)
         end do
         do ilmtt = 1, nlmtt_phi
            write(nfout,'("(ilmtt = ",i5,")",f10.5)') ilmtt,norm_phig(ilmtt,ikphig)
         end do
      end do
      write(nfout,'(" << m_NLP_phir_dot_PWs.wd_phig >> END")')
    end subroutine wd_phig
  end subroutine m_NLP_phir_dot_PWs_3D

  subroutine m_NLP_add_betar_dot_PWs_3D(nfout,kv3,vkxyz)

    integer,       intent(in)       :: nfout,kv3
    real(kind=DP), intent(in)       :: vkxyz(kv3,3,CRDTYP)

    real(kind=DP)       :: fac, facr
    integer             :: ik,iksnl,it,n,lmt1,lmtt1,il1,im1,tau1,nspher
    integer             :: id_sname = -1

    call tstatc0_begin('m_NLP_add_betar_dot_PWs ',id_sname,1)
!!$    write(nfout,*) ' <<< m_NLP_add_betar_dot_PWs >>>'
    call innerPr_allocate_3D()

    fac = PAI4/dsqrt(univol)
    do ik = 1, kv3, nspin
       if(map_k(ik) /= myrank_k) cycle                     ! MPI
!!$       write(nfout,*) ' ik = ', ik
!!$       write(nfout,*) ' kgp,kg1,kv3 = ',kgp,kg1,kv3
!!$       write(nfout,*) ' allocated(iba)   = ',allocated(iba)
!!$       write(nfout,*) ' allocated(nbase) = ',allocated(nbase)
!!$       write(nfout,*) ' allocated(ngabc) = ',allocated(ngabc)
!!$       write(nfout,*) ' allocated(qx)    = ',allocated(qx)
!!$       write(nfout,*) ' allocated(vlength)=',allocated(vlength)

!       call k_plus_G_vectors_3D(ik,kgp,kg1,kv3,iba,nbase,vkxyz,ngabc,rltv&
!            &,qx,qy,qz,vlength)                            ! ->(bottom_Subr.)
       call k_plus_G_vectors_3D(ik,kg,kg1,kv3,iba,nbase,vkxyz,ngabc,rltv&
            &,qx,qy,qz,vlength)                            ! ->(bottom_Subr.)
       iksnl = (ik-1)/nspin + 1
       do it=1,ntyp
          call new_radr_and_wos(ik,it)                 ! --> radr, wos
          do lmt1 = 1,ilmt_add(it)
             call m_PP_tell_lmtt_l_m_tau_add(lmt1,it,lmtt1,il1,im1,tau1,nspher)
             call sphr(np_g1k(ik),nspher,qx,qy,qz,ylm)        ! -(bottom_Subr.)
             if(ipri >= 2) call wd_lmt_l_m_tau_etc &
                                (nfout,it,lmt1,il1,im1,tau1,lmtt1,nspher)
             snl2 = 0.d0
             do n = 1,nmesh(it)
                facr = fac*wos(n)*radr(n)*betar_add(n,it)
                wka(1:np_g1k(ik)) = vlength(1:np_g1k(ik))*radr(n)
!!$                call find_critical_point(1.d0,np_g1k(ik),wka,idp)
!!$                call dsjnvn(il1-1,np_g1k(ik),wka,idp,wkb)     ! -(bottom_Subr.)
                call dsjnv(il1-1,np_g1k(ik),wka,wkb)     ! -(bottom_Subr.)
                snl2(1:np_g1k(ik)) = snl2(1:np_g1k(ik)) + facr*wkb(1:np_g1k(ik))*ylm(1:np_g1k(ik))
             end do
             snl_add(1:np_g1k(ik),lmtt1,iksnl) = snl2(1:np_g1k(ik))
          end do
       end do
    end do
    if(iprisnl >= 2) call wd_snl_add
    call innerPr_deallocate_3D
    call tstatc0_end(id_sname)

  contains
    subroutine wd_snl_add
      integer :: i, ilmtt, ik, j, iksnl
      write(nfout,'(" << m_NLP_add_betar_dot_PWs.wd_snl_add >>")')
      write(nfout,'(10("(",3i2,")"))') ((ngabc(i,j),j=1,3),i=1,30)
      do ik = 1, kv3, nspin
         if(map_k(ik) /= myrank_k) cycle        ! MPI
         iksnl = (ik-1)/nspin + 1
         write(nfout,'(" ik = ",i5)') iksnl
         write(nfout,'(" nbase(1:8,",i5,")",8i10)') ik,(nbase(i,ik),i=1,8)
         do ilmtt = 1, nlmtt_add
! === DEBUG by tkato 2013/10/30 ================================================
!           write(nfout,'("(ilmtt = ",i5,")",8f10.5,99(/15x,8f10.5))') ilmtt,(snl_add(i,ilmtt,iksnl),i=1,kg1)
            write(nfout,'("(ilmtt = ",i5,")",8f10.5,99(/15x,8f10.5))') ilmtt,(snl_add(i,ilmtt,iksnl),i=1,np_g1k(ik))
! ==============================================================================
         end do
      end do
    end subroutine wd_snl_add
  end subroutine m_NLP_add_betar_dot_PWs_3D

  subroutine m_NLP_paor_dot_PWs_3D(nfout,kv3,vkxyz)
    integer,       intent(in)       :: nfout,kv3
    real(kind=DP), intent(in)       :: vkxyz(kv3,3,CRDTYP)

    real(kind=DP)       :: fac, facr
    integer             :: ik,iksnl,it,n,lmt1,lmtt1,il1,im1,tau1,nspher
    integer             :: n1,n2,ig
    integer             :: id_sname = -1
    real(kind=DP), dimension(:,:), allocatable :: snl2_mpi, snl1

#ifndef _mNLP_no_loop_exchange_
    integer, parameter  :: lcmax = 4
    integer :: ip
    integer :: mil, mp  ! mp = maxval(np)
    integer, dimension(:),     allocatable :: nil  ! d(ntyp)
    integer, dimension(:,:),   allocatable :: nt   ! d(ntyp,mil)
    integer, dimension(:,:,:), allocatable :: tu2p ! d(ntau,ntyp,mil)
    integer, dimension(:),     allocatable :: np   ! d(mil), np=sum_{u=1}^{ntyp}(nt(u,mil))
    integer, dimension(:,:,:), allocatable ::  pm2lmtt1 ! d(mp,2*(mil-1)+1,mil)
    real(kind=DP), dimension(:,:), allocatable :: snl_t, snl_t_mpi ! d(iba(ik),mp)
    real(kind=DP), dimension(:,:), allocatable :: radr_p, wos_p ! d(n1:n2,ntyp)
    integer :: p, mradr_indp 
    logical, dimension(:,:), allocatable :: flag_radr !d(ntyp,mil)
    integer, dimension(:), allocatable   :: ip_radr !d(ntyp)
    real(kind=DP) :: r, w
#endif

    call tstatc0_begin('m_NLP_paor_dot_PWs ',id_sname,1)
!!$    write(nfout,*) ' <<< m_NLP_paor_dot_PWs >>>'
    call innerPr_allocate_3D()

    fac = PAI4/dsqrt(univol)
#ifndef _mNLP_no_loop_exchange_
    call set_index_arrays() ! mil, mp, nil, nt, tu2p, np, pm2lmtt1, contained here

    n = ceiling(dble(mmesh)/nrank_e)
    if(n == 0) n = 1
    n1 = n*myrank_e + 1
    n2 = n*(myrank_e+1)
    if(n2 > mmesh) n2 = mmesh

    allocate(flag_radr(ntyp,mil))
    allocate(ip_radr(ntyp))
!!$ASASASASAS
    flag_radr = .false. ;  ip_radr = 0
!!$ASASASASAS
    call set_flag_radr_and_ip_radr() ! -> mradr_indp,flag_radr, ip_radr

    if(n1 <= n2) then
       allocate(radr_p(n1:n2,mradr_indp))
       allocate(wos_p(n1:n2,mradr_indp))
!!$ASASASASAS
       radr_p = 0; wos_p = 0
!!$ASASASASAS
       call radr_and_wos_p(n1,n2,ntyp,mradr_indp,ip_radr,radr_p,wos_p) ! -> radr_p, wos_p
    end if

    do ik = 1, kv3, nspin
       if(map_k(ik) /= myrank_k) cycle                     ! MPI
!!$ASASASASAS
!!$       allocate(snl_t(iba(ik),mp))
!!$       if(nrank_e > 1) allocate(snl_t_mpi(iba(ik),mp))
       allocate(snl_t(np_g1k(ik),mp)); snl_t = 0
       if(nrank_e > 1) then
!!$          allocate(snl_t_mpi(maxval(np_g1k),mp)); snl_t_mpi = 0
          allocate(snl_t_mpi(np_g1k(ik),mp)); snl_t_mpi = 0
       endif
!!$ASASASASAS

!       call k_plus_G_vectors_3D(ik,kgp,kg1,kv3,iba,nbase,vkxyz,ngabc,rltv&
!            &,qx,qy,qz,vlength)                            ! ->(bottom_Subr.)
       call k_plus_G_vectors_3D(ik,kg,kg1,kv3,iba,nbase,vkxyz,ngabc,rltv&
            &,qx,qy,qz,vlength)                            ! ->(bottom_Subr.)
       iksnl = (ik-1)/nspin + 1

       do il1 = 1, mil
          snl_t = 0.d0
          do n = n1, n2
             do it = 1, ntyp
                if(il1 > nil(it)) cycle
                if(flag_radr(it,il1)) then
                   r = radr_p(n,ip_radr(it))
                   do ig = 1, np_g1k(ik)
                      wka(ig) = vlength(ig)*r
                   end do
                   call dsjnv(il1-1,np_g1k(ik),wka,wkb)     ! -(bottom_Subr.)
                end if

                w = fac * wos_p(n,ip_radr(it)) * radr_p(n,ip_radr(it))
                do tau1 = 1, nt(it,il1)
                   facr = w * paor(n,il1,tau1,it)
                   ip = tu2p(tau1,it,il1)
                   do ig = 1, np_g1k(ik)
                      snl_t(ig,ip) = snl_t(ig,ip) + facr*wkb(ig)
                   end do
                end do
             end do
          end do
          if(nrank_e > 1) then
             call mpi_allreduce(snl_t,snl_t_mpi,np_g1k(ik)*np(il1),mpi_double_precision,mpi_sum,mpi_kg_world,ierr)
             snl_t = snl_t_mpi
          end if

          do im1 = 1, 2*(il1-1)+1
             nspher = (il1-1)**2 + im1
             call sphr(np_g1k(ik),nspher,qx,qy,qz,ylm)        ! -(bottom_Subr.)
             do p = 1, np(il1)
                ip = pm2lmtt1(p,im1,il1)
                do ig = 1, np_g1k(ik)
                   paog(ig,ip,iksnl) = snl_t(ig,p)*ylm(ig)
                end do
             end do
          end do
       end do
       if(nrank_e > 1) deallocate(snl_t_mpi)
       deallocate(snl_t)
    end do

    if(n1 <= n2) deallocate(wos_p,radr_p)
    deallocate(ip_radr,flag_radr)
    call dealloc_index_arrays()

#else
    if(nrank_e > 1) then
       allocate(snl1(maxval(np_g1k),nlmtt_pao)); snl1 = 0.d0
       allocate(snl2_mpi(maxval(np_g1k),nlmtt_pao))
!!$ASASASASAS
        snl2_mpi = 0.d0
!!$ASASASASAS
    end if

    do ik = 1, kv3, nspin
       if(map_k(ik) /= myrank_k) cycle                     ! MPI

!       call k_plus_G_vectors_3D(ik,kgp,kg1,kv3,iba,nbase,vkxyz,ngabc,rltv&
!            &,qx,qy,qz,vlength)                            ! ->(bottom_Subr.)
       call k_plus_G_vectors_3D(ik,kg,kg1,kv3,iba,nbase,vkxyz,ngabc,rltv&
            &,qx,qy,qz,vlength)                            ! ->(bottom_Subr.)
       iksnl = (ik-1)/nspin + 1

       do it=1,ntyp

          n = ceiling(dble(nmesh(it))/nrank_e)
          if(n == 0) n = 1
          n1 = n*myrank_e + 1
          n2 = n*(myrank_e+1)
          if(n2 > nmesh(it)) n2 = nmesh(it)

          call new_radr_and_wos(ik,it)                 ! --> radr, wos
          do lmt1 = 1,ilmt_pao(it)
             call m_PP_tell_lmtt_l_m_tau_pao(lmt1,it,lmtt1,il1,im1,tau1,nspher)
             call sphr(np_g1k(ik),nspher,qx,qy,qz,ylm)        ! -(bottom_Subr.)
             if(ipri >= 2) call wd_lmt_l_m_tau_etc &
                                (nfout,it,lmt1,il1,im1,tau1,lmtt1,nspher)
             snl2 = 0.d0

             do n = n1, n2
                facr = fac*wos(n)*radr(n)*paor(n,il1,tau1,it)
                do ig = 1, np_g1k(ik)
                   wka(ig) = vlength(ig)*radr(n)
                end do
                call dsjnv(il1-1,np_g1k(ik),wka,wkb)     ! -(bottom_Subr.)
                do ig = 1, np_g1k(ik)
                   snl2(ig) = snl2(ig) + facr*wkb(ig)
                end do
             end do

             if(nrank_e == 1) then
                do ig = 1, np_g1k(ik)
                   paog(ig,lmtt1,iksnl) = snl2(ig)*ylm(ig)
                end do
             else
                do ig = 1, np_g1k(ik)
                   snl1(ig,lmtt1) = snl2(ig)*ylm(ig)
                end do
             end if
          end do
       end do

       if(nrank_e > 1) then
          call mpi_allreduce(snl1, snl2_mpi,maxval(np_g1k)*nlmtt_pao,mpi_double_precision,mpi_sum,mpi_ke_world,ierr)
          do lmtt1 = 1, nlmtt_pao
             do ig = 1, np_g1k(ik)
                paog(ig,lmtt1,iksnl) = snl2_mpi(ig,lmtt1)
             end do
          end do
       end if
    end do
    if(nrank_e > 1) deallocate(snl2_mpi,snl1)
#endif
    if(ipripao >= 2) call wd_paog
    call innerPr_deallocate_3D
    call tstatc0_end(id_sname)

  contains
#ifndef _mNLP_no_loop_exchange_
    subroutine set_flag_radr_and_ip_radr()
      integer :: it, il, ip, it2, is, it1

      mradr_indp = 1
      ip_radr(1) = 1
      do it = 2, ntyp
         is = 0
         search_loop: do it2 = 1, mradr_indp
            ip = ip_radr(it2)
            if((nmesh(ip) == nmesh(it) .and. dabs(xh(ip)-xh(it)) < DELTA .and. &
              & dabs(rmax(ip)-rmax(it)) < DELTA)) then
               is = it2
               exit search_loop
            end if
         end do search_loop
         if(is <= 0 .or. is > mradr_indp) then
            mradr_indp = mradr_indp+1
            is = mradr_indp
         end if
         ip_radr(it) = is
      end do

      if(ipri >= 2) then
         write(nfout,'(" !mNLP : mradr_indp = ", i5)') mradr_indp
         do it = 1, ntyp
            write(nfout,'(" !mNLP : ip_radr(",i5,") = ",i5)') it, ip_radr(it)
         end do
      end if

      do il = 1, mil
         flag_radr(1:ntyp,il) = .true.
         it1 = 0
         do it = 1, ntyp
            if(il > nil(it)) cycle
            it1 = it1+1
            if(it1 == 1) then
               flag_radr(it,il) = .true.
            else if(it1 >= 2) then
               if(ip_radr(ip) /= ip_radr(it)) then
                  flag_radr(it,il) = .true.
               else
                  flag_radr(it,il) = .false.
               end if
            end if
            ip = it
         end do
      end do

      if(ipri >= 2) then
         do il = 1, mil
            do it = 1, ntyp
               write(nfout,'(" !mNLP : flag_radr(",i5,",",i5,") = ",l3)') it,il,flag_radr(it,il)
            end do
         end do
      end if
    end subroutine set_flag_radr_and_ip_radr

    subroutine set_index_arrays()
      integer :: it, lmt1, n, il1, im1

      allocate(nil(ntyp)); nil = 0
      allocate(nt(ntyp,lcmax)); allocate(tu2p(ntau,ntyp,lcmax)); allocate(np(lcmax))
      nt = 0; tu2p = 0
      mil = 0
      if(ipri>=2) write(nfout,'(" !mNLP:    it, lmt1: lmtt1, il1, im1, tau1, nspher")')
      do it=1,ntyp
         do lmt1 = 1, ilmt_pao(it)
            call m_PP_tell_lmtt_l_m_tau_pao(lmt1,it,lmtt1,il1,im1,tau1,nspher)
            if(ipri>=2) write(nfout,'(" !mNLP: ",2i5," : ",5i5)') it,lmt1, lmtt1,il1,im1,tau1,nspher
            if(mil < il1) mil = il1
            if(nil(it) < il1) nil(it) = il1
            if(nt(it,il1) < tau1) nt(it,il1) = tau1
         end do
      end do

      do il1=1,mil
         n = 0
         np(il1) = 0
         do it=1,ntyp
            if(il1 > nil(it)) cycle
            np(il1) = np(il1)+nt(it,il1)
            do tau1=1,nt(it,il1)
               n = n+1
               tu2p(tau1,it,il1)= n
            end do
         end do
      end do

      mp = maxval(np(1:mil))
      allocate(pm2lmtt1(mp,2*(mil-1)+1,mil))
!!$ASASASASAS
      pm2lmtt1 = 0
!!$ASASASASAS
      do it = 1, ntyp
         do lmt1=1,ilmt_pao(it)
            call m_PP_tell_lmtt_l_m_tau_pao(lmt1,it,lmtt1,il1,im1,tau1,nspher)
            n = tu2p(tau1,it,il1)
            pm2lmtt1(n,im1,il1) = lmtt1
         end do
      end do

      if(ipri>=2) then
         write(nfout,'(" !mNLP: mil = ",i5)') mil
         do it = 1, ntyp
            write(nfout,'(" !mNLP: nil(",i3,") = ",i5)') it,nil(it)
         end do
         do it = 1, ntyp
            write(nfout,'(" !mNLP: nt(",i3,", : ) = ",5i5)') it,(nt(it,il1),il1=1,nil(it))
         end do

         write(nfout,'(" !mNLP: mp = ",i5)') mp
         do il1 = 1, mil
            write(nfout,'(" !mNLP: np(",i3,")=",i5)') il1, np(il1)
         end do
         do il1 = 1, mil
            do it = 1, ntyp
               if(il1 > nil(it)) cycle
               do tau1 = 1, nt(it,il1)
                  write(nfout,'(" !mNLP: tu2p(",i3,",",i3,",",i3,") = ",i5)') tau1,it,il1,tu2p(tau1,it,il1)
               end do
            end do
         end do
         do il1 = 1, mil
            do im1 = 1, 2*(il1-1)+1
               do n = 1, np(il1)
                  write(nfout,'(" !mNLP: il1 = ",i3," im1 = ",i3," n = ", i3, " pm2lmtt1 = ",i5)') &
                       & il1,im1, n,pm2lmtt1(n,im1,il1)
               end do
            end do
         end do
      end if
    end subroutine set_index_arrays

    subroutine dealloc_index_arrays()
      deallocate(pm2lmtt1,np,tu2p,nt,nil)
    end subroutine dealloc_index_arrays
#endif

    subroutine wd_paog
      integer :: i, ilmtt, ik, j, iksnl
      write(nfout,'(" << m_NLP_poor_dot_PWs.wd_paog >>")')
      write(nfout,'(10("(",3i2,")"))') ((ngabc(i,j),j=1,3),i=1,30)
      do ik = 1, kv3, nspin
         if(map_k(ik) /= myrank_k) cycle        ! MPI
         iksnl = (ik-1)/nspin + 1
         write(nfout,'(" ik = ",i5)') iksnl
         write(nfout,'(" nbase(1:8,",i5,")",8i10)') ik,(nbase(i,ik),i=1,8)
         do ilmtt = 1, nlmtt_pao
            write(nfout,'("(ilmtt = ",i5,")",8f10.5,99(/15x,8f10.5))') ilmtt,(paog(i,ilmtt,iksnl),i=1,kg1)
         end do
      end do
    end subroutine wd_paog
  end subroutine m_NLP_paor_dot_PWs_3D
!------
  subroutine m_NLP_wd_snl_3D(nfout,nfcntn_bin,F_CNTN_BIN_partitioned,kv3)
   use m_Parallelization,     only : mpi_ke_world, mpi_kg_world

    integer, intent(in) :: nfout,nfcntn_bin
    logical, intent(in) :: F_CNTN_BIN_partitioned
    integer, intent(in) :: kv3

    integer                                :: i,ik,iksnl,p,q,iend  ! MPI
    integer                                :: i1, i2
    real(kind=DP), allocatable, dimension(:,:) :: snl_wk  ! MPI
    real(kind=DP), allocatable, dimension(:,:) :: snl_mpi  ! MPI
    integer             :: id_sname = -1
#ifdef __TIMER_SUB__
  call timer_sta(1369)
#endif

    call tstatc0_begin('m_NLP_wd_snl ',id_sname)

!f    allocate(snl_wk(kg1,nlmtt))                      ! MPI
!!$ASASASASAS
!f    snl_wk = 0.d0
!!$ASASASASAS
    if(istress == 0) then
       iend = 0
    else 
       iend = 9
    end if
    do i = 0, iend
       if(F_CNTN_BIN_partitioned) then
          allocate(snl_wk(maxval(np_g1k),nlmtt))                      ! MPI
          snl_wk = 0.d0
          do iksnl = ista_snl, iend_snl
             if(i==0) then
                snl_wk = snl(:,:,iksnl)
             else
                p = mod(i,3); if(p == 0) p = 3
                q = (i-p)/3 + 1
                snl_wk = snld(:,:,iksnl,p,q)
             end if
#ifdef __TIMER_IODO__
  call timer_sta(1399)
#endif
             write(nfcntn_bin) snl_wk
#ifdef __TIMER_IODO__
  call timer_end(1399)
#endif
          end do
          deallocate(snl_wk)
       else
          allocate(snl_wk(kg1,nlmtt))                      ! MPI
          allocate(snl_mpi(kg1,nlmtt))                      ! MPI
          snl_wk = 0.d0
          do ik = 1, kv3, nspin                             ! MPI
             if(iprisnl >= 2) write(nfout,'(" ! ik = ",i5," <<m_NLP_wd_snl>>")') ik
             iksnl = (ik-1)/nspin + 1                       ! MPI
             if(map_k(ik) == myrank_k .and. myrank_e == 0 ) then  ! MPI
                if(i==0) then
!f                   snl_wk = snl(:,:,iksnl)                  ! MPI
#ifdef __TIMER_IODO__
  call timer_sta(1400)
#endif
                   do i2 = 1, nlmtt
                     do i1 = ista_g1k(ik), iend_g1k(ik)
                       snl_wk(i1,i2) = snl(i1-ista_g1k(ik)+1,i2,iksnl)                  ! MPI
                   enddo; enddo
#ifdef __TIMER_IODO__
  call timer_end(1400)
#endif
#ifdef __TIMER_IOCOMM__
  call timer_barrier(mpi_ke_world)
  call timer_sta(1401)
#endif
                   call mpi_allreduce(snl_wk,snl_mpi,kg1*nlmtt, &
                       &mpi_double_precision,mpi_sum,mpi_ke_world,ierr)
#ifdef __TIMER_IOCOMM__
  call timer_end(1401)
#endif
                else
                   p = mod(i,3); if(p == 0) p = 3           ! MPI
                   q = (i-p)/3 + 1                          ! MPI
!f                   snl_wk = snld(:,:,iksnl,p,q)             ! MPI
#ifdef __TIMER_IODO__
  call timer_sta(1402)
#endif
                   do i2 = 1, nlmtt
                     do i1 = ista_g1k(ik), iend_g1k(ik)
                       snl_wk(i1,i2) = snld(i1-ista_g1k(ik)+1,i2,iksnl,p,q)                  ! MPI
                   enddo; enddo
#ifdef __TIMER_IODO__
  call timer_end(1402)
#endif
#ifdef __TIMER_IOCOMM__
  call timer_barrier(mpi_ke_world)
  call timer_sta(1403)
#endif
                   call mpi_allreduce(snl_wk,snl_mpi,kg1*nlmtt, &
                       &mpi_double_precision,mpi_sum,mpi_ke_world,ierr)
#ifdef __TIMER_IOCOMM__
  call timer_end(1403)
#endif
                end if
!f                if(map_k(ik) /= 0) call mpi_send(snl_mpi,kg1*nlmtt,mpi_double_precision &
!f                     &         ,0,1,mpi_comm_group,ierr)    ! MPI
!f             else if(mype == 0 .and. map_k(ik) /= 0) then   ! MPI
!f                call mpi_recv(snl_wk,kg1*nlmtt,mpi_double_precision,map_k(ik)*nrank_e &
!f                     &            ,1,mpi_comm_group,istatus,ierr) !MPI
             end if                                         ! MPI
!f             if(mype == 0) write(nfcntn_bin) snl_wk         ! MPI
#ifdef __TIMER_IODO__
  call timer_sta(1404)
#endif
             if(mype == 0) write(nfcntn_bin) snl_mpi         ! MPI
#ifdef __TIMER_IODO__
  call timer_end(1404)
#endif
          end do
          deallocate(snl_wk)
          deallocate(snl_mpi)
       end if
    end do
    if(iprisnl >= 2) then
       write(nfout,'(" ! snl is written (m_NLP_wd_snl)")')
       if(F_CNTN_BIN_partitioned) then
          write(nfout,'(" ! snl size = ",i9)') kg1*nlmtt*(iend_snl-ista_snl+1)*8
       else
          write(nfout,'(" ! snl size = ",i9)') kg1*nlmtt*kv3*8
       end if
    end if
!f    deallocate(snl_wk)
    call tstatc0_end(id_sname)
#ifdef __TIMER_SUB__
  call timer_end(1369)
#endif
  end subroutine m_NLP_wd_snl_3D

  subroutine m_NLP_rd_snl_3D(nfout,nfcntn_bin,F_CNTN_BIN_partitioned,kv3)
   use m_Parallelization,     only : mpi_ke_world, mpi_kg_world

    integer, intent(in) :: nfout,nfcntn_bin
    logical, intent(in) :: F_CNTN_BIN_partitioned
    integer, intent(in) :: kv3

    integer                                :: i,ik,iksnl,p,q,iend  ! MPI
    integer                                :: i1, i2
    real(kind=DP), allocatable, dimension(:,:) :: snl_wk  ! MPI
    integer             :: id_sname = -1
#ifdef __TIMER_SUB__
  call timer_sta(1368)
#endif

    call tstatc0_begin('m_NLP_rd_snl ',id_sname)
!f    allocate(snl_wk(kg1,nlmtt))                      ! MPI
!!$ASASASASAS
!f    snl_wk = 0.d0
!!$ASASASASAS
    if(iprisnl >= 2) write(nfout,'(" kg1 = ",i6, " nlmtt = ",i6)') kg1,nlmtt
    if(iprisnl >= 2) write(nfout,'(" ! nfcntn_bin = ", i6)') nfcntn_bin
    if(istress == 0) then
       iend = 0
    else 
       iend = 9
    end if
    do i = 0, iend                                       ! MPI
       if(F_CNTN_BIN_partitioned) then
          allocate(snl_wk(maxval(np_g1k),nlmtt))                      ! MPI
          snl_wk = 0.d0
#ifdef __TIMER_IODO__
  call timer_sta(1395)
#endif
          do iksnl = ista_snl, iend_snl
             if(iprisnl >= 2) write(nfout,'(" iksnl = ",i6, " <<m_NLP_rd_snl>>")') iksnl
             read(nfcntn_bin) snl_wk
             if(i==0) then
                snl(:,:,iksnl) = snl_wk
             else
                p = mod(i,3); if(p == 0) p = 3           ! MPI
                q = (i-p)/3 + 1                          ! MPI
                snld(:,:,iksnl,p,q) = snl_wk             ! MPI
             end if
          end do
#ifdef __TIMER_IODO__
  call timer_end(1395)
#endif
          deallocate(snl_wk)
       else
          allocate(snl_wk(kg1,nlmtt))                      ! MPI
          snl_wk = 0.d0
          do ik = 1, kv3, nspin                             ! MPI
             if(iprisnl >= 2) write(nfout,'(" ik = ",i6, " <<m_NLP_rd_snl>>")') ik
             iksnl = (ik-1)/nspin + 1                       ! MPI
#ifdef __TIMER_IODO__
  call timer_sta(1396)
#endif
             if(mype == 0) read(nfcntn_bin) snl_wk          ! MPI
#ifdef __TIMER_IODO__
  call timer_end(1396)
#endif
             if(iprisnl >= 2) write(nfout,'(" snl_wk is read")')
!f             if(mype == 0 .and. map_k(ik) /= 0) then        ! MPI
!f                call mpi_send(snl_wk,kg1*nlmtt,mpi_double_precision,map_k(ik)*nrank_e,1 &
!f                     &            , mpi_comm_group,ierr)       ! MPI
!f             else if(map_k(ik) /= 0 .and. map_k(ik) == myrank_k .and. myrank_e == 0) then ! MPI
!f                call mpi_recv(snl_wk,kg1*nlmtt,mpi_double_precision,0,1 &
!f                     &            , mpi_comm_group,istatus,ierr) ! MPI
!f             end if
#ifdef __TIMER_IOCOMM__
  call timer_barrier(mpi_comm_group)
  call timer_sta(1397)
#endif
             call mpi_bcast(snl_wk,kg1*nlmtt &
               & ,mpi_double_precision,0,mpi_comm_group,ierr)
#ifdef __TIMER_IOCOMM__
  call timer_end(1397)
#endif

             if(map_k(ik) == myrank_k) then                 ! MPI
!f                call mpi_bcast(snl_wk,kg1*nlmtt,mpi_double_precision,0 &
!f                     &              ,mpi_k_world(myrank_k),ierr) ! MPI
#ifdef __TIMER_IODO__
  call timer_sta(1398)
#endif
                if(i==0) then
!f                   snl(:,:,iksnl) = snl_wk                  ! MPI
                   do i2 = 1, nlmtt
                     do i1 = ista_g1k(ik), iend_g1k(ik)
                       snl(i1-ista_g1k(ik)+1,i2,iksnl) = snl_wk(i1,i2)
                   enddo; enddo
                else
                   p = mod(i,3); if(p == 0) p = 3           ! MPI
                   q = (i-p)/3 + 1                          ! MPI
!f                   snld(:,:,iksnl,p,q) = snl_wk             ! MPI
                   do i2 = 1, nlmtt
                     do i1 = ista_g1k(ik), iend_g1k(ik)
                       snld(i1-ista_g1k(ik)+1,i2,iksnl,p,q) = snl_wk(i1,i2)
                   enddo; enddo
                end if
#ifdef __TIMER_IODO__
  call timer_end(1398)
#endif
             end if
          end do
          deallocate(snl_wk)
       end if
    end do
!f    deallocate(snl_wk)
    call tstatc0_end(id_sname)
#ifdef __TIMER_SUB__
  call timer_end(1368)
#endif
  end subroutine m_NLP_rd_snl_3D
! === For epsmain by tkato 2013/11/14 ==========================================
  subroutine m_NLP_epsmain_reallocate()
    if(allocated(snl)) deallocate(snl)
    allocate(snl(maxval(np_g1k),nlmtt,ista_snl:iend_snl)); snl = 0.d0
    if(allocated(snl_add)) deallocate(snl_add)
    allocate(snl_add(maxval(np_g1k),nlmtt_add,ista_snl:iend_snl)); snl_add = 0.d0
  end subroutine m_NLP_epsmain_reallocate
! ==============================================================================

end module m_NonLocal_Potential
