3.4 CUDAによる並列化

3.4.1 CUDAによる並列化

CUDAによりNVIDIAのグラフィックボードで高速に計算することができます。(GPGPU)

リスト3-4-1にCUDA版のred-black SOR法の反復計算部分を示します。
blockは(k, j)の2次元の(16, 16)とし、gridは((Nz+1)/16, (Ny+1)/16, Nx+1)の3次元とします。
これは(i, j, k)の3次元空間を1次元配列で並べるときに内側からk, j, iの順にとっているためです。 CUDAでは変数のアドレスがスレッド番号の順に並んでいることが重要です[8]。
残差の計算はblock内で部分和をshared memoryのreduction演算で求め、部分和の配列をCPUに転送しその和をCPUで計算しています。このようにすると変数全体を転送して残差を計算することに比べると転送量が1/256倍になり、データ転送に要する時間を無視することができます。
なお、CUDAではred-black法を使用しないと反復計算が発散します。

リスト3-4-1 CUDAを用いた反復計算部の並列化（sor.cuの一部、novectorモード）

     1	#include "ost.h"
     2	#include "ost_cuda.h"
     3	#include "ost_prototype.h"
     4	#include "reduction_sum.cu"
     5	
     6	__host__ __device__
     7	static real_t update(
     8		int i, int j, int k, int64_t ni, int64_t nj, int64_t nk, int64_t n,
     9		real_t *rxp, real_t *ryp, real_t *rzp, real_t *rxm, real_t *rym, real_t *rzm,
    10		real_t *v, id_t *idvolt, id_t *idepsr, real_t *epsr, real_t omega)
    11	{
    12		const real_t e = epsr[idepsr[n]];
    13	
    14		const real_t axp = (e + epsr[idepsr[n + ni]]) * rxp[i];
    15		const real_t axm = (e + epsr[idepsr[n - ni]]) * rxm[i];
    16		const real_t ayp = (e + epsr[idepsr[n + nj]]) * ryp[j];
    17		const real_t aym = (e + epsr[idepsr[n - nj]]) * rym[j];
    18		const real_t azp = (e + epsr[idepsr[n + nk]]) * rzp[k];
    19		const real_t azm = (e + epsr[idepsr[n - nk]]) * rzm[k];
    20	
    21		const real_t res = (idvolt[n] ? 0 : 1) * omega * ((
    22			axp * v[n + ni] +
    23			axm * v[n - ni] +
    24			ayp * v[n + nj] +
    25			aym * v[n - nj] +
    26			azp * v[n + nk] +
    27			azm * v[n - nk]) / (axp + axm + ayp + aym + azp + azm) - v[n]);
    28	
    29		v[n] += res;
    30	
    31		return (res * res);
    32	}
    33	
    34	__global__
    35	static void update_gpu(int oe,
    36		int nx, int ny, int nz,
    37		int64_t ni, int64_t nj, int64_t nk,
    38		real_t *rxp, real_t *ryp, real_t *rzp, real_t *rxm, real_t *rym, real_t *rzm,
    39		real_t *v, id_t *idvolt, id_t *idepsr, real_t *epsr, real_t omega, real_t *d_res2)
    40	{
    41		extern __shared__ real_t sm[];
    42	
    43		const int i = blockIdx.z;
    44		const int j = threadIdx.y + (blockIdx.y * blockDim.y);
    45		const int k = threadIdx.x + (blockIdx.x * blockDim.x);
    46	
    47		const int nthread = blockDim.x * blockDim.y;
    48		const int tid = threadIdx.x + (threadIdx.y * blockDim.x);
    49		const int bid = blockIdx.x + (blockIdx.y * gridDim.x) + (blockIdx.z * gridDim.x * gridDim.y);
    50	
    51		real_t res2 = 0;
    52	
    53		if (i <= nx) {
    54		if (j <= ny) {
    55		if (k <= nz) {
    56			if ((i + j + k) % 2 == oe) {
    57				const int64_t n = ((i + 1) * ni) + ((j + 1) * nj) + ((k + 1) * nk);
    58				res2 = update(
    59					i, j, k, ni, nj, nk, n,
    60					rxp, ryp, rzp, rxm, rym, rzm,
    61					v, idvolt, idepsr, epsr, omega);
    62			}
    63		}
    64		}
    65		}
    66		sm[tid] = res2;
    67	
    68		reduction_sum(tid, nthread, sm, &d_res2[bid]);
    69	}

3.4.2 CUDAの計算時間

表3-4-1にCUDAの計算時間を示します。
novectorモードとvectorモードはほぼ同じです。
以上からGPUでもCPUと同じく、使用メモリーが少ないnovectorモードを推奨します。

表3-4-1 CUDAの計算時間（単精度）

ベンチマーク	novector	vector
300	2.5秒	2.5秒
400	6.0秒	6.0秒
500	11.4秒	11.3秒
600	19.3秒	19.2秒