200 lines
10 KiB
Plaintext
200 lines
10 KiB
Plaintext
|
|
|||
|
_OPTIMIZING IN A PARALLEL ENVIRONMENT_
|
|||
|
by Barr E. Bauer
|
|||
|
|
|||
|
[LISTING ONE]
|
|||
|
|
|||
|
program test 1
|
|||
|
* 2
|
|||
|
* purpose is to test SGI parallelization scheme for loop selection, 3
|
|||
|
* numerically-intensive calculations, and total reduction. See text 4
|
|||
|
* for details. 5
|
|||
|
* 6
|
|||
|
parameter (MAXFIRST=250, MAXSECOND=250, MAXTHIRD=10) 7
|
|||
|
real*8 a(MAXTHIRD,MAXSECOND,MAXFIRST) 8
|
|||
|
real*8 b(MAXTHIRD,MAXSECOND,MAXFIRST) 9
|
|||
|
real*8 sub_total(MAXFIRST), partial_total(4) 10
|
|||
|
real*8 d(MAXTHIRD), c, tmp ! local variables 11
|
|||
|
real*8 dist(MAXSECOND,MAXFIRST), grand_total 12
|
|||
|
real*8 grand_total ! test for proper operation 13
|
|||
|
logical parallel ! selects 2-version loops 14
|
|||
|
integer*4 iflag ! used to show LASTLOCAL value 15
|
|||
|
16
|
|||
|
data parallel /.false./ 17
|
|||
|
data sub_total, iflag /MAXFIRST*0.0, 0/ 18
|
|||
|
* 19
|
|||
|
* outer loop: contains both interior loops 20
|
|||
|
* 21
|
|||
|
22
|
|||
|
* C$doacross local(k,j,i,tmp,d,c), share(a,b,sub_total,dist), 23
|
|||
|
* C$& lastlocal(iflag) 24
|
|||
|
25
|
|||
|
do i = 1, MAXFIRST 26
|
|||
|
* 27
|
|||
|
* first inner loop: fills arrays a and b 28
|
|||
|
* 29
|
|||
|
30
|
|||
|
* C$doacross local(j,k,c), share(i,a,b) 31
|
|||
|
32
|
|||
|
do j = 1, MAXSECOND 33
|
|||
|
do k = 1, MAXTHIRD 34
|
|||
|
a(k,j,i) = dsqrt(dfloat(i*j*k)) 35
|
|||
|
c = 1.0 - a(k,j,i) 36
|
|||
|
if (c .le. 0.0 .and. i .lt. j*k) then 37
|
|||
|
c = -c 38
|
|||
|
else 39
|
|||
|
c = c**2 40
|
|||
|
endif 41
|
|||
|
b(k,j,i) = 32*(dcos(c)**5)*dsin(c)- 42
|
|||
|
1 32*(dcos(c)**3)*dsin(c)+ 43
|
|||
|
2 6*dcos(c)*dsin(c) 44
|
|||
|
enddo 45
|
|||
|
enddo 46
|
|||
|
* 47
|
|||
|
* seond inner loop: determines distance and starts summation 48
|
|||
|
* 49
|
|||
|
50
|
|||
|
* c$doacross local(j,k,d,tmp), share(i,a,b,dist,sub_total), 51
|
|||
|
* c$& lastlocal(iflag) 52
|
|||
|
53
<0A> do j=1, MAXSECOND 54
|
|||
|
tmp = 0.0 55
|
|||
|
do k = 1, MAXTHIRD 56
|
|||
|
d(k) = a(k,j,i) - b(k,j,i) 57
|
|||
|
enddo 58
|
|||
|
do k = 1, MAXTHIRD 59
|
|||
|
tmp = tmp + d(k)**2 60
|
|||
|
enddo 61
|
|||
|
dist(j,i) = dsqrt(tmp) 62
|
|||
|
if (dist(j,i) .le. 0.1) iflag = iflag + 1 63
|
|||
|
sub_total(j) = sub_total(j) + dist(j,i) 64
|
|||
|
enddo 65
|
|||
|
enddo 66
|
|||
|
* 67
|
|||
|
* the next section is an example of sum reduction optimized to the 68
|
|||
|
* parallel environment and the use of a more efficient 2 loop summation 69
|
|||
|
* 70
|
|||
|
* if -mp option is active, parallel is set to .true. which then 71
|
|||
|
* selects the parallel version 72
|
|||
|
* 73
|
|||
|
74
|
|||
|
C$ parallel = .true. 75
|
|||
|
grand_total = 0.0 76
|
|||
|
if (parallel) then ! parallel version 77
|
|||
|
C$ num_threads = mp_numthreads() 78
|
|||
|
ichunk = (MAXFIRST + (num_threads - 1))/num_threads 79
|
|||
|
80
|
|||
|
C$doacross local(k,j), 81
|
|||
|
C$& share(num_threads,partial_total,sub_total,ichunk) 82
|
|||
|
83
|
|||
|
do k = 1, num_threads ! this loop is parallelized 84
|
|||
|
partial_total(k) = 0.0 85
|
|||
|
do j = k*ichunk - ichunk + 1, min(k*ichunk,MAXFIRST) 86
|
|||
|
partial_total(k) = partial_total(k) + sub_total(j) 87
|
|||
|
enddo 88
|
|||
|
enddo 89
|
|||
|
do j = 1, num_threads ! smaller loop handled as scalar 90
|
|||
|
grand_total = grand_total + partial_total(j) 91
|
|||
|
enddo 92
|
|||
|
else ! the scalar version 93
|
|||
|
do j = 1, MAXFIRST 94
|
|||
|
grand_total = grand_total + sub_total(j) 95
|
|||
|
enddo 96
|
|||
|
endif 97
|
|||
|
98
|
|||
|
if (parallel) then 99
|
|||
|
C$ write (*,10) grand_total, num_threads 100
|
|||
|
C$ write (*,20) iflag 101
|
|||
|
else 102
|
|||
|
write (*,30) grand_total 103
|
|||
|
write (*,40) iflag 104
|
|||
|
endif 105
|
|||
|
stop 106
|
|||
|
C$10 format(1x,'grand total = ',g10.3,'threads = ',i4) 107
|
|||
|
C$20 format(1x,'parallel iflag = ',i10) 108
<0A>30 format(1x,'grand total = ',g10.3) 109
|
|||
|
40 format(1x,'scalar iflag = ',i10) 110
|
|||
|
end 111
|
|||
|
|
|||
|
|
|||
|
|
|||
|
[LISTING TWO]
|
|||
|
|
|||
|
|
|||
|
(source code)
|
|||
|
|
|||
|
subroutine example(a, b, c, n)
|
|||
|
integer*4 n
|
|||
|
real*4 a(n), b(n), c(n)
|
|||
|
|
|||
|
(additional code)
|
|||
|
|
|||
|
c$doacross local(i, x)
|
|||
|
do i=1, n
|
|||
|
x = a(n) * b(n)
|
|||
|
c(n) = x**2
|
|||
|
enddo
|
|||
|
|
|||
|
(additional code)
|
|||
|
|
|||
|
return
|
|||
|
end
|
|||
|
|
|||
|
(the loop is transformed to)
|
|||
|
|
|||
|
! Illustrative form of the routine the doacross loop in subroutine
! example is turned into. It runs _local_ntrip passes of the loop body,
! starting at index _local_start and adding _incr to the index after
! each pass.
! NOTE(review): a, b and c are used in the body but are neither
! arguments nor declared here; presumably the generated code reaches
! the caller's arrays some other way - confirm.
subroutine _example_1(
|
|||
|
1 _local_start, ! index starting value
<0A> 2 _local_ntrip, ! number of loop executions
|
|||
|
3 _incr, ! index increment
|
|||
|
4 _my_threadno) ! unique process ID number
|
|||
|
|
|||
|
integer*4 _local_start, _local_ntrip, _incr, _my_threadno
|
|||
|
|
|||
|
integer*4 i ! declared local
|
|||
|
real*4 x ! declared local
|
|||
|
|
|||
|
integer*4 _tmp ! created local
|
|||
|
|
|||
|
! i is advanced by hand; _tmp only counts the passes
i = _local_start
|
|||
|
do _tmp = 1, _local_ntrip
|
|||
|
x = a(i) * b(i)
|
|||
|
c(i) = x**2
|
|||
|
i = i + _incr
|
|||
|
enddo
|
|||
|
! _my_threadno is not referenced in the body shown
return
|
|||
|
end
|
|||
|
|
|||
|
|
|||
|
Example 1: A typical DO loop
|
|||
|
|
|||
|
|
|||
|
! Each pass writes a(i) from b(i) and the scalar x only; no pass reads
! an element of a written by another pass.
do i = 1, n
|
|||
|
a(i) = x * b(i)
|
|||
|
enddo
|
|||
|
|
|||
|
|
|||
|
Example 2: A DO loop in which the array variable references a
|
|||
|
value that is not current with the index
|
|||
|
|
|||
|
! arr(i) is computed from arr(i-1), the element written by the previous
! pass, so each pass needs the result of the one before it.
do i = 2, n
|
|||
|
arr(i) = b(i) - arr(i-1)
|
|||
|
enddo
|
|||
|
|
|||
|
Example 3: An example of load imbalance
|
|||
|
|
|||
|
! The inner loop runs i times, so a pass of the outer loop does more
! work the larger i is.
do i = 1, n
|
|||
|
do j = 1, i
|
|||
|
a(j, i) = a(j, i) * xmult
|
|||
|
enddo
|
|||
|
enddo
|
|||
|
|
|||
|
|
|||
|
Example 4: Load balancing
|
|||
|
|
|||
|
! Same update as the triangular loop nest of Example 3, with an extra
! outer loop over k = 1..num_threads. Pass k handles
! i = k, k+num_threads, k+2*num_threads, ... up to n, so small and large
! values of i are spread across the k passes.
! NOTE(review): mp_numthreads() presumably returns the number of threads
! in use - confirm against the SGI documentation.
num_threads = mp_numthreads()
|
|||
|
c$doacross local(i, j, k)
|
|||
|
do k = 1, num_threads
|
|||
|
do i = k, n, num_threads
|
|||
|
do j = 1, i
|
|||
|
a(j, i) = a(j, i) * xmult
|
|||
|
enddo
|
|||
|
enddo
|
|||
|
enddo
|
|||
|
|