      SUBROUTINE PDLAED3( ICTXT, K, N, NB, D, DROW, DCOL, RHO, DLAMDA,
     $                    W, Z, U, LDU, BUF, INDX, INDCOL, INDROW,
     $                    INDXR, INDXC, CTOT, NPCOL, INFO )
      USE OMP_LIB
*
*  -- ScaLAPACK auxiliary routine (version 1.7) --
*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
*     and University of California, Berkeley.
*     December 31, 1998
*
*     .. Scalar Arguments ..
      INTEGER            DCOL, DROW, ICTXT, INFO, K, LDU, N, NB, NPCOL
      DOUBLE PRECISION   RHO
*     ..
*     .. Array Arguments ..
      INTEGER            CTOT( 0: NPCOL-1, 4 ), INDCOL( * ),
     $                   INDROW( * ), INDX( * ), INDXC( * ), INDXR( * )
      DOUBLE PRECISION   BUF( 1:3*K ), D( * ), DLAMDA( * ), U( LDU, * ),
!      DOUBLE PRECISION   BUF( * ), D( * ), DLAMDA( * ), U( LDU, * ),
     $                   W( * ), Z(1:K)
!     $                   W( * ), Z( * )
*     ..
*
*  Purpose
*  =======
*
*  PDLAED3_X finds the roots of the secular equation, as defined by the
*  values in D, W, and RHO, between 1 and K.  It makes the
*  appropriate calls to SLAED4
*
*  This code makes very mild assumptions about floating point
*  arithmetic. It will work on machines with a guard digit in
*  add/subtract, or on those binary machines without guard digits
*  which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
*  It could conceivably fail on hexadecimal or decimal machines
*  without guard digits, but we know of none.
*
*  Arguments
*  =========
*
*  ICTXT  (global input) INTEGER
*         The BLACS context handle, indicating the global context of
*         the operation on the matrix. The context itself is global.
*
*  K      (output) INTEGER
*         The number of non-deflated eigenvalues, and the order of the
*         related secular equation. 0 <= K <=N.
*
*  N      (input) INTEGER
*         The dimension of the symmetric tridiagonal matrix.  N >= 0.
*
*  NB      (global input) INTEGER
*          The blocking factor used to distribute the columns of the
*          matrix. NB >= 1.
*
*  D      (input/output) DOUBLE PRECISION array, dimension (N)
*         On entry, D contains the eigenvalues of the two submatrices to
*         be combined.
*         On exit, D contains the trailing (N-K) updated eigenvalues
*         (those which were deflated) sorted into increasing order.
*
*  DROW   (global input) INTEGER
*          The process row over which the first row of the matrix D is
*          distributed. 0 <= DROW < NPROW.
*
*  DCOL   (global input) INTEGER
*          The process column over which the first column of the
*          matrix D is distributed. 0 <= DCOL < NPCOL.
*
*  Q      (input/output) DOUBLE PRECISION array, dimension (LDQ, N)
*         On entry, Q contains the eigenvectors of two submatrices in
*         the two square blocks with corners at (1,1), (N1,N1)
*         and (N1+1, N1+1), (N,N).
*         On exit, Q contains the trailing (N-K) updated eigenvectors
*         (those which were deflated) in its last N-K columns.
*
*  LDQ    (input) INTEGER
*         The leading dimension of the array Q.  LDQ >= max(1,NQ).
*
*  RHO    (global input/output) DOUBLE PRECISION
*         On entry, the off-diagonal element associated with the rank-1
*         cut which originally split the two submatrices which are now
*         being recombined.
*         On exit, RHO has been modified to the value required by
*         PDLAED3_X.
*
*  DLAMDA (global output) DOUBLE PRECISION array, dimension (N)
*         A copy of the first K eigenvalues which will be used by
*         SLAED3 to form the secular equation.
*
*  W      (global output) DOUBLE PRECISION array, dimension (N)
*         The first k values of the final deflation-altered z-vector
*         which will be passed to SLAED3.
*
*  Z      (global input) DOUBLE PRECISION array, dimension (N)
*         On entry, Z contains the updating vector (the last
*         row of the first sub-eigenvector matrix and the first row of
*         the second sub-eigenvector matrix).
*         On exit, the contents of Z have been destroyed by the updating
*         process.
*
*  U     (global output) DOUBLE PRECISION array
*         global dimension (N, N), local dimension (LDU, NQ).
*         Q  contains the orthonormal eigenvectors of the symmetric
*         tridiagonal matrix.
*
*  LDU    (input) INTEGER
*         The leading dimension of the array U.
*
*  QBUF   (workspace) DOUBLE PRECISION array, dimension 3*N
*
*
*  INDX   (workspace) INTEGER array, dimension (N)
*         The permutation used to sort the contents of DLAMDA into
*         ascending order.
*
*  INDCOL (workspace) INTEGER array, dimension (N)
*
*
*  INDROW (workspace) INTEGER array, dimension (N)
*
*
*  INDXR (workspace) INTEGER array, dimension (N)
*
*
*  INDXC (workspace) INTEGER array, dimension (N)
*
*  CTOT   (workspace) INTEGER array, dimension( NPCOL, 4)
*
*  NPCOL   (global input) INTEGER
*          The total number of columns over which the distributed
*           submatrix is distributed.
*
*  INFO   (output) INTEGER
*          = 0:  successful exit.
*          < 0:  if INFO = -i, the i-th argument had an illegal value.
*          > 0:  The algorithm failed to compute the ith eigenvalue.
*
*  =====================================================================
*
*     .. Parameters ..
      DOUBLE PRECISION   ONE
      PARAMETER          ( ONE = 1.0D+0 )
*     ..
*     .. Local Scalars ..
      INTEGER            COL, GI, I, IINFO, IIU, IPD, IU, J, JJU, JU,
     $                   KK, KL, KLC, KLR, MYCOL, MYKL, MYKLR, MYROW,
     $                   NPROW, PDC, PDR, ROW
      DOUBLE PRECISION   AUX, TEMP
      REAL(8) :: SZ(1:K), SBUF(1:K)
      INTEGER :: SINFO
*     ..
*     .. External Functions ..
      INTEGER            INDXG2L
      DOUBLE PRECISION   DLAMC3, DNRM2
      EXTERNAL           INDXG2L, DLAMC3, DNRM2
*     ..
*     .. External Subroutines ..
      EXTERNAL           BLACS_GRIDINFO, DCOPY, DGEBR2D, DGEBS2D,
     $                   DGERV2D, DGESD2D, DLAED4
*     ..
*     .. Intrinsic Functions ..
      INTRINSIC          MOD, SIGN, SQRT
*     ..
      include 'mpif.h'
      include 'trd.h'
*     .. Executable Statements ..
*
*     Test the input parameters.
*
      INFO = 0
*
*     Quick return if possible
*
      IF( K.EQ.0 )
     $   RETURN
*
      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
*
      ROW = DROW
      COL = DCOL
!$OMP PARALLEL DO PRIVATE(I,J,ROW,COL)
      DO 20 I = 1, N, NB
         ROW = MOD(DROW+(I-1)/NB, NPROW)
         COL = MOD(DCOL+(I-1)/NB, NPCOL)
         DO 10 J = 0, NB - 1
            IF( I+J.LE.N ) THEN
               INDROW( I+J ) = ROW
               INDCOL( I+J ) = COL
            END IF
   10    CONTINUE
!         ROW = MOD( ROW+1, NPROW )
!         COL = MOD( COL+1, NPCOL )
   20 CONTINUE
!$OMP END PARALLEL DO
         ROW = MOD(DROW+(NB-1)/NB+1, NPROW)
         COL = MOD(DCOL+(NB-1)/NB+1, NPCOL)
*
      MYKL = CTOT( MYCOL, 1 ) + CTOT( MYCOL, 2 ) + CTOT( MYCOL, 3 )
      KLR = MYKL / NPROW
      IF( MYROW.EQ.DROW ) THEN
         MYKLR = KLR + MOD( MYKL, NPROW )
      ELSE
         MYKLR = KLR
      END IF
      PDC = 1
      COL = DCOL
   30 CONTINUE
      IF( MYCOL.NE.COL ) THEN
         PDC = PDC + CTOT( COL, 1 ) + CTOT( COL, 2 ) + CTOT( COL, 3 )
         COL = MOD( COL+1, NPCOL )
         GO TO 30
      END IF
      PDR = PDC
      KL = KLR + MOD( MYKL, NPROW )
      ROW = DROW
   40 CONTINUE
      IF( MYROW.NE.ROW ) THEN
         PDR = PDR + KL
         KL = KLR
         ROW = MOD( ROW+1, NPROW )
         GO TO 40
      END IF
*
!$OMP PARALLEL DO
      DO I = 1, K
         DLAMDA( I ) = DLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I )
         Z( I ) = ONE
      END DO
!$OMP END PARALLEL DO

      IF( MYKLR.GT.0 ) THEN
!-         KK = PDR
         SINFO = INFO
!$OMP PARALLEL PRIVATE(I,J,KK,IINFO,SZ,SBUF,SINFO,AUX)

         SZ( 1:K ) = ONE
!$OMP DO
         DO 80 I = 1, MYKLR
            KK = PDR + I - 1
            CALL DLAED4( K, KK, DLAMDA, W, SBUF, RHO, AUX, IINFO )
            BUF(K+I) = AUX
            IF( IINFO.NE.0 ) THEN
               SINFO = KK
            END IF
*
*     ..Compute part of z
*
            DO 60 J = 1, KK - 1
               SZ( J ) = SZ( J )*( SBUF( J ) /
     $                  ( DLAMDA( J )-DLAMDA( KK ) ) )
   60       CONTINUE
            SZ( KK ) = SZ( KK )*SBUF( KK )
            DO 70 J = KK + 1, K
               SZ( J ) = SZ( J )*( SBUF( J ) /
     $                  ( DLAMDA( J )-DLAMDA( KK ) ) )
   70       CONTINUE
!            KK = KK + 1
   80    CONTINUE
!$OMP ENDDO

!$OMP BARRIER

!$OMP MASTER
         Z(1:K) = SZ(1:K)
         INFO = SINFO
!$OMP END MASTER

!$OMP BARRIER

         DO I=1,OMP_GET_NUM_THREADS()-1
            IF ( OMP_GET_THREAD_NUM() == I ) THEN
               DO J=1,K
                  Z(J) = Z(J) * SZ(J)
               END DO
               INFO = MAX(INFO, SINFO)
            END IF
!$OMP BARRIER
         END DO

!$OMP END PARALLEL
      ELSE

         Z(1:K) = ONE

      END IF
*
      call MPI_ALLREDUCE( Z, BUF, K, MPI_DOUBLE_PRECISION,
     $                    MPI_PROD, TRD_COMM_WORLD, I )

!$OMP PARALLEL DO
      DO I = 1, K
         Z( I ) = SIGN( SQRT( -BUF( I ) ), W( I ) )
      END DO
!$OMP END PARALLEL DO
      
      IF( MYKLR.GT.0 ) THEN
         IF( MYROW.NE.DROW ) THEN
            call send_dbl( BUF( K+1 ), MYKLR,
     $                     DROW+NPROW*MYCOL+1, TRD_COMM_WORLD )
         ELSE
            IPD = 2*K + 1
            CALL DCOPY( MYKLR, BUF( K+1 ), 1, BUF( IPD ), 1 )
            IF( KLR.GT.0 ) THEN
               IPD = MYKLR + IPD
               ROW = MOD( DROW+1, NPROW )
               DO I = 1, NPROW - 1
                  call recv_dbl( BUF( IPD ), KLR,
     $                     ROW+NPROW*MYCOL+1, TRD_COMM_WORLD )
                  IPD = IPD + KLR
                  ROW = MOD( ROW+1, NPROW )
               END DO
            END IF
         END IF
      END IF
*
      IF( MYROW.EQ.DROW ) THEN
         IF( MYCOL.NE.DCOL .AND. MYKL.NE.0 ) THEN
            CALL DCOPY( MYKL, BUF( 2*K+1 ), 1, BUF( K+1 ), 1 )
            call send_dbl( BUF( K+1 ), MYKL,
     $                     MYROW+NPROW*DCOL+1, TRD_COMM_WORLD )
         ELSE IF( MYCOL.EQ.DCOL ) THEN
            IPD = 2*K + 1
            COL = DCOL
            KL = MYKL
            DO I = 1, NPCOL - 1
               IPD = IPD + KL
               COL = MOD( COL+1, NPCOL )
               KL = CTOT( COL, 1 ) + CTOT( COL, 2 ) + CTOT( COL, 3 )
               IF( KL.NE.0 ) THEN
                  call recv_dbl( BUF( IPD ), KL,
     $                     MYROW+NPROW*COL+1, TRD_COMM_WORLD )
               END IF
            END DO
            CALL DCOPY( K, BUF( 2*K+1 ), 1, BUF( K+1 ), 1 )
         END IF
      END IF
*
*     Diffusion
*
      CALL bcast_dbl( BUF(K+1), K, DROW+NPROW*DCOL+1, TRD_COMM_WORLD )
*
*     Copy of D at the good place
*
      KLC = 0
      KLR = 0
      DO 140 I = 1, K
         GI = INDX( I )
         D( GI ) = BUF( K+I )
         COL = INDCOL( GI )
         ROW = INDROW( GI )
         IF( COL.EQ.MYCOL ) THEN
            KLC = KLC + 1
            INDXC( KLC ) = I
         END IF
         IF( ROW.EQ.MYROW ) THEN
            KLR = KLR + 1
            INDXR( KLR ) = I
         END IF
  140 CONTINUE
*
*     Compute eigenvectors of the modified rank-1 modification.
*
      IF( MYKL.NE.0 ) THEN
!$OMP PARALLEL DO PRIVATE(I,J,KK,IU,JU,IIU,JJU,BUF,AUX,TEMP,IINFO)
!$+ REDUCTION(MAX:INFO)
         DO 180 J = 1, MYKL
            KK = INDXC( J )
            JU = INDX( KK )
            JJU = INDXG2L( JU, NB, J, J, NPCOL )
            CALL DLAED4( K, KK, DLAMDA, W, BUF, RHO, AUX, IINFO )
            IF( IINFO.NE.0 ) THEN
               INFO = KK
            END IF
            IF( K.EQ.1 .OR. K.EQ.2 ) THEN
               DO 150 I = 1, KLR
                  KK = INDXR( I )
                  IU = INDX( KK )
                  IIU = INDXG2L( IU, NB, J, J, NPROW )
                  U( IIU, JJU ) = BUF( KK )
  150          CONTINUE
               GO TO 180
            END IF
*
            DO 160 I = 1, K
               BUF( I ) = Z( I ) / BUF( I )
  160       CONTINUE
            TEMP = DNRM2( K, BUF, 1 )
            DO 170 I = 1, KLR
               KK = INDXR( I )
               IU = INDX( KK )
               IIU = INDXG2L( IU, NB, J, J, NPROW )
               U( IIU, JJU ) = BUF( KK ) / TEMP
  170       CONTINUE
*
  180    CONTINUE
!$OMP END PARALLEL DO
      END IF
*
  190 CONTINUE
*
      RETURN
*
*     End of PDLAED3_X
*
      END
