GPU Puzzles

This article provides solutions to the GPU Puzzles by Sasha Rush.

For some challenging puzzles, we include explanations as comments within the code.
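
All of the solutions follow the puzzles' convention of a factory function that receives the cuda module and returns the kernel body. To exercise them outside the puzzle notebook, we use the small runner below. Note that run_kernel is our own sketch, not the harness bundled with the puzzles; with NUMBA_ENABLE_CUDASIM=1 set in the environment, everything also runs on a machine without a GPU.

import numpy as np
import numba
from numba import cuda


def run_kernel(make_kernel, out, args, blocks=(1,), threads=(4,)):
    # compile the kernel produced by the factory, then launch it
    kernel = cuda.jit(make_kernel(cuda))
    kernel[blocks, threads](out, *args)
    return out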

Puzzle 1: Map

def map_spec(a):
    return a + 10


def map_test(cuda):
    def call(out, a) -> None:
        local_i = cuda.threadIdx.x
        out[local_i] = a[local_i] + 10

    return call
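
As a sanity check, each kernel's output can be compared against its spec (using the run_kernel sketch from the introduction); the same pattern applies to every puzzle, so below we only spell it out where the launch configuration is interesting.

a = np.arange(4, dtype=np.float32)
out = run_kernel(map_test, np.zeros(4, dtype=np.float32), [a])
assert np.allclose(out, map_spec(a))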

Puzzle 2: Zip

def zip_spec(a, b):
    return a + b

def zip_test(cuda):
    def call(out, a, b) -> None:
        local_i = cuda.threadIdx.x
        out[local_i] = a[local_i] + b[local_i]

    return call

Puzzle 3: Guards

def map_guard_test(cuda):
    def call(out, a, size) -> None:
        local_i = cuda.threadIdx.x
        if local_i < size:
            out[local_i] = a[local_i] + 10

    return call
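
The guard is what makes it safe to launch more threads than there are elements; here, 8 threads over 4 elements:

a = np.arange(4, dtype=np.float32)
out = run_kernel(map_guard_test, np.zeros(4, dtype=np.float32), [a, 4],
                 threads=(8,))
assert np.allclose(out, map_spec(a))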

Puzzle 4: Map 2D

def map_2D_test(cuda):
    def call(out, a, size) -> None:
        local_i = cuda.threadIdx.x
        local_j = cuda.threadIdx.y
        if local_i < size and local_j < size:
            out[local_i, local_j] = a[local_i, local_j] + 10

    return call

Puzzle 5: Broadcast

def broadcast_test(cuda):
    def call(out, a, b, size) -> None:
        local_i = cuda.threadIdx.x
        local_j = cuda.threadIdx.y
        if local_i < size and local_j < size:
            out[local_i, local_j] = a[local_i, 0] + b[0, local_j]

    return call
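
In this puzzle a has shape (size, 1) and b has shape (1, size), which is why the kernel indexes a[local_i, 0] and b[0, local_j]:

a = np.arange(3, dtype=np.float32).reshape(3, 1)
b = np.arange(3, dtype=np.float32).reshape(1, 3)
out = run_kernel(broadcast_test, np.zeros((3, 3), dtype=np.float32),
                 [a, b, 3], threads=(3, 3))
assert np.allclose(out, a + b)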

Puzzle 6: Blocks

def map_block_test(cuda):
    def call(out, a, size) -> None:
        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        if i < size:
            out[i] = a[i] + 10

    return call
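
The grid has to be sized so that blocks times threads covers the input, i.e. ceil(size / threads_per_block) blocks:

size, tpb = 10, 4
a = np.arange(size, dtype=np.float32)
blocks = ((size + tpb - 1) // tpb,)  # 3 blocks of 4 threads cover 10 elements
out = run_kernel(map_block_test, np.zeros(size, dtype=np.float32),
                 [a, size], blocks=blocks, threads=(tpb,))
assert np.allclose(out, map_spec(a))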

Puzzle 7: Blocks 2D

def map_block2D_test(cuda):
    def call(out, a, size) -> None:
        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        j = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
        if i < size and j < size:
            out[i, j] = a[i, j] + 10

    return call

Puzzle 8: Shared

TPB = 4
def shared_test(cuda):
    def call(out, a, size) -> None:
        shared = cuda.shared.array(TPB, numba.float32)
        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        local_i = cuda.threadIdx.x

        if i < size:
            shared[local_i] = a[i]
        # the barrier must sit outside the guard: every thread in the
        # block has to reach syncthreads, including out-of-range ones
        cuda.syncthreads()

        if i < size:
            out[i] = shared[local_i] + 10

    return call
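
Each block stages only its own slice of a into shared memory; with TPB = 4, covering 10 elements takes ceil(10 / 4) = 3 blocks:

a = np.arange(10, dtype=np.float32)
out = run_kernel(shared_test, np.zeros(10, dtype=np.float32), [a, 10],
                 blocks=(3,), threads=(TPB,))
assert np.allclose(out, a + 10)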

Puzzle 9: Pooling

def pool_spec(a):
    out = np.zeros(*a.shape)
    for i in range(a.shape[0]):
        out[i] = a[max(i - 2, 0) : i + 1].sum()
    return out


TPB = 8
def pool_test(cuda):
    def call(out, a, size) -> None:
        shared = cuda.shared.array(TPB, numba.float32)
        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        local_i = cuda.threadIdx.x
        if i < size:
            shared[local_i] = a[i]
        # barrier outside the guard so every thread reaches it
        cuda.syncthreads()
        if i < size:
            total = shared[local_i]
            if local_i >= 1:
                total += shared[local_i - 1]
            if local_i >= 2:
                total += shared[local_i - 2]
            out[i] = total

    return call
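
The reads of shared[local_i - 1] and shared[local_i - 2] never cross a block boundary, so this solution relies on the puzzle's single-block launch (size == TPB):

a = np.arange(8, dtype=np.float32)
out = run_kernel(pool_test, np.zeros(8, dtype=np.float32), [a, 8],
                 threads=(TPB,))
assert np.allclose(out, pool_spec(a))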

Puzzle 10: Dot Product

def dot_spec(a, b):
    return a @ b

TPB = 8
def dot_test(cuda):
    def call(out, a, b, size) -> None:
        shared = cuda.shared.array(TPB, numba.float32)

        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        local_i = cuda.threadIdx.x
        partial = 0.0
        if i < size:
            # 2 Global Reads: a[i] and b[i]
            partial = a[i] * b[i]

        shared[local_i] = partial
        cuda.syncthreads()

        # Parallel tree reduction to sum the partial products,
        # e.g. round 1: shared[0] += shared[1], shared[2] += shared[3], ...
        #      round 2: shared[0] += shared[2], shared[4] += shared[6]
        #      round 3: shared[0] += shared[4]
        stride = 1
        while stride < TPB:
            index = 2 * stride * local_i
            if index + stride < TPB:
                shared[index] += shared[index + stride]
            stride *= 2
            cuda.syncthreads()

        # After reduction, the first thread has the total dot product
        if local_i == 0:
            out[0] = shared[0]
    return call
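
Because only thread 0 writes out[0], this too assumes the puzzle's single-block launch; with several blocks, each block's thread 0 would race on the same element (a multi-block reduction instead writes one partial sum per block, as in the prefix-sum puzzle below):

a = np.arange(8, dtype=np.float32)
b = np.arange(8, dtype=np.float32)
out = run_kernel(dot_test, np.zeros(1, dtype=np.float32), [a, b, 8],
                 threads=(TPB,))
assert np.allclose(out[0], dot_spec(a, b))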

Puzzle 11: 1D Convolution

def conv_spec(a, b):
    out = np.zeros(*a.shape)
    len = b.shape[0]
    for i in range(a.shape[0]):
        out[i] = sum([a[i + j] * b[j] for j in range(len) if i + j < a.shape[0]])
    return out


MAX_CONV = 4
TPB = 8
TPB_MAX_CONV = TPB + MAX_CONV
def conv_test(cuda):
    def call(out, a, b, a_size, b_size) -> None:
        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        local_i = cuda.threadIdx.x

        shared_a = cuda.shared.array(TPB_MAX_CONV, numba.float32)
        shared_b = cuda.shared.array(MAX_CONV, numba.float32)

        if i < a_size:
            shared_a[local_i] = a[i]
        if local_i < b_size:
            shared_b[local_i] = b[local_i]
        else:
            # use the otherwise-idle threads to load the trailing (halo)
            # elements of shared_a, so each thread does at most two
            # global reads, e.g. local_i=4, b_size=4 -> local_i2=0
            local_i2 = local_i - b_size
            i2 = i - b_size
            if i2 + TPB < a_size and local_i2 < b_size:
                shared_a[TPB + local_i2] = a[i2 + TPB]
        cuda.syncthreads()

        acc = 0.0
        for k in range(b_size):
            if i + k < a_size:  # same bounds check as the spec
                acc += shared_a[local_i + k] * shared_b[k]
        if i < a_size:
            out[i] = acc

    return call
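
With two blocks, the halo load is what lets the last few threads of block 0 see elements that block 1 owns:

a = np.arange(15, dtype=np.float32)
b = np.arange(4, dtype=np.float32)
out = run_kernel(conv_test, np.zeros(15, dtype=np.float32),
                 [a, b, 15, 4], blocks=(2,), threads=(TPB,))
assert np.allclose(out, conv_spec(a, b))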

Puzzle 12: Prefix Sum

TPB = 8
def sum_spec(a):
    out = np.zeros((a.shape[0] + TPB - 1) // TPB)
    for j, i in enumerate(range(0, a.shape[-1], TPB)):
        out[j] = a[i : i + TPB].sum()
    return out


def sum_test(cuda):
    def call(out, a, size: int) -> None:
        cache = cuda.shared.array(TPB, numba.float32)
        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        local_i = cuda.threadIdx.x

        if i < size:
            cache[local_i] = a[i]
        else:
            cache[local_i] = 0.0  # zero-pad so partial blocks reduce cleanly
        cuda.syncthreads()

        # Stride shrinks:
        #  1. cache[0] += cache[4], cache[1] += cache[5], ...
        #  2. cache[0] += cache[2], cache[1] += cache[3]
        #  3. cache[0] += cache[1]
        stride = TPB // 2
        while stride > 0:
            if local_i < stride:
                cache[local_i] += cache[local_i + stride]
            cuda.syncthreads()
            stride = stride // 2
        
        if local_i == 0:
            out[cuda.blockIdx.x] = cache[0]


    return call
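
Each block writes its own partial sum, so a two-block launch over 10 elements reproduces the two chunk sums of the spec:

a = np.arange(10, dtype=np.float32)
out = run_kernel(sum_test, np.zeros(2, dtype=np.float32), [a, 10],
                 blocks=(2,), threads=(TPB,))
assert np.allclose(out, sum_spec(a))  # [28.0, 17.0]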

Puzzle 13: Axis Sum

TPB = 8
def sum_spec(a):
    out = np.zeros((a.shape[0], (a.shape[1] + TPB - 1) // TPB))
    for j, i in enumerate(range(0, a.shape[-1], TPB)):
        out[..., j] = a[..., i : i + TPB].sum(-1)
    return out


def axis_sum_test(cuda):
    def call(out, a, size: int) -> None:
        cache = cuda.shared.array(TPB, numba.float32)
        i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        local_i = cuda.threadIdx.x
        batch = cuda.blockIdx.y

        if i < size:
            cache[local_i] = a[batch, i]

        cuda.syncthreads()
        
        # Same reduction as the previous puzzle, except out-of-range
        # lanes are skipped via the `i + stride < size` guard instead
        # of zero-padding the cache
        stride = TPB // 2
        while stride > 0:
            if local_i < stride and i + stride < size:
                cache[local_i] += cache[local_i + stride]
            cuda.syncthreads()
            stride = stride // 2
        
        if local_i == 0:
            out[batch, cuda.blockIdx.x] = cache[0]

    return call
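
The batch dimension rides on blockIdx.y, so the grid is (blocks per row, number of batches):

a = np.arange(24, dtype=np.float32).reshape(3, 8)
out = run_kernel(axis_sum_test, np.zeros((3, 1), dtype=np.float32),
                 [a, 8], blocks=(1, 3), threads=(TPB,))
assert np.allclose(out, sum_spec(a))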

Puzzle 14: Matrix Multiply

def matmul_spec(a, b):
    return a @ b


TPB = 3
def mm_oneblock_test(cuda):
    def call(out, a, b, size: int) -> None:
        a_shared = cuda.shared.array((TPB, TPB), numba.float32)
        b_shared = cuda.shared.array((TPB, TPB), numba.float32)

        row = cuda.blockIdx.y * TPB + cuda.threadIdx.y
        col = cuda.blockIdx.x * TPB + cuda.threadIdx.x
        local_row = cuda.threadIdx.y
        local_col = cuda.threadIdx.x

        tmp = 0.0

        # Tile the computation: a single block has a limited number of
        # threads (and limited shared memory), so we load one TPB x TPB
        # tile of each input, accumulate its contribution, and slide to
        # the next tile until the whole row/column has been consumed.
        for m in range((size + TPB - 1) // TPB):
            a_row = row
            a_col = m * TPB + local_col
            b_row = m * TPB + local_row
            b_col = col

            if a_row < size and a_col < size:
                a_shared[local_row, local_col] = a[a_row, a_col]
            else:
                a_shared[local_row, local_col] = 0.0

            if b_row < size and b_col < size:
                b_shared[local_row, local_col] = b[b_row, b_col]
            else:
                b_shared[local_row, local_col] = 0.0

            cuda.syncthreads()

            for k in range(TPB):
                tmp += a_shared[local_row, k] * b_shared[k, local_col]

            # barrier again so no thread starts overwriting the tiles
            # while a neighbour is still reading them
            cuda.syncthreads()

        if row < size and col < size:
            out[row, col] = tmp

    return call
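
With TPB = 3, a 4 x 4 multiply needs a 2 x 2 grid of blocks, and the tiling loop runs twice per block:

size = 4
a = np.arange(size * size, dtype=np.float32).reshape(size, size)
b = np.arange(size * size, dtype=np.float32).reshape(size, size)
out = run_kernel(mm_oneblock_test, np.zeros((size, size), dtype=np.float32),
                 [a, b, size], blocks=(2, 2), threads=(TPB, TPB))
assert np.allclose(out, matmul_spec(a, b))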