cuda: add test for device linking
diff --git a/test cases/cuda/17 separate compilation linking/b.cu b/test cases/cuda/17 separate compilation linking/b.cu
new file mode 100644
index 0000000..33ff561
--- /dev/null
+++ b/test cases/cuda/17 separate compilation linking/b.cu
@@ -0,0 +1,14 @@
+#include "b.h"
+
+/* N-element array in device memory; declared extern in b.h. */
+__device__ int g[N];
+
+/*
+ * Adds one to the element of g selected by the calling thread's threadIdx.x.
+ * There is no bounds check, so the caller must launch with at most N
+ * threads along x.
+ */
+__device__ void bar(void)
+{
+    g[threadIdx.x] += 1;
+}
diff --git a/test cases/cuda/17 separate compilation linking/b.h b/test cases/cuda/17 separate compilation linking/b.h
new file mode 100644
index 0000000..d8a0efc
--- /dev/null
+++ b/test cases/cuda/17 separate compilation linking/b.h
@@ -0,0 +1,8 @@
+/* Number of elements in g; main.cu launches its kernel with N threads. */
+#define N 8
+
+/* Increments g[threadIdx.x]; defined in b.cu. */
+extern __device__ void bar(void);
+
+/* Device array defined in b.cu and accessed from main.cu as well. */
+extern __device__ int g[N];
diff --git a/test cases/cuda/17 separate compilation linking/main.cu b/test cases/cuda/17 separate compilation linking/main.cu
new file mode 100644
index 0000000..b07d01b
--- /dev/null
+++ b/test cases/cuda/17 separate compilation linking/main.cu
@@ -0,0 +1,74 @@
+#include <stdio.h>
+
+#include "b.h"
+
+/*
+ * Stores the block's thread indices into g in reverse order, then calls
+ * bar(), which is defined in a different translation unit (b.cu).
+ *
+ * Launch layout: a single block of exactly N threads along x. The kernel
+ * indexes a[] and g[] by threadIdx.x without a bounds check.
+ * Shared memory: N ints, statically allocated.
+ */
+__global__ void foo(void)
+{
+    __shared__ int a[N];
+    a[threadIdx.x] = threadIdx.x;
+
+    /* every a[] slot must be written before another thread reads it */
+    __syncthreads();
+
+    g[threadIdx.x] = a[blockDim.x - threadIdx.x - 1];
+
+    bar();
+}
+
+/*
+ * Runs foo on one block of N threads, copies g back to the host and checks
+ * the sum of its elements.
+ *
+ * Returns 0 when the sum matches, 1 on any CUDA error or on a mismatch, so
+ * that a wrong result is visible to the test runner through the exit status.
+ */
+int main(void)
+{
+    /*
+     * foo stores 0..N-1 (reversed) into g and bar adds 1 to each element,
+     * giving N*(N-1)/2 + N. For N = 8 this is 36.
+     */
+    const int expected = N * (N - 1) / 2 + N;
+    unsigned int i;
+    int *dg, hg[N];
+    int sum = 0;
+    cudaError_t err;
+
+    foo<<<1, N>>>();
+
+    /* the launch itself returns nothing; launch failures are reported here */
+    err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("couldn't launch the kernel: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+
+    if (cudaGetSymbolAddress((void**)&dg, g)) {
+        printf("couldn't get the symbol addr\n");
+        return 1;
+    }
+    /* cudaMemcpy blocks the host, so hg is safe to read once it returns */
+    if (cudaMemcpy(hg, dg, N * sizeof(int), cudaMemcpyDeviceToHost)) {
+        printf("couldn't memcpy\n");
+        return 1;
+    }
+
+    for (i = 0; i < N; i++) {
+        sum += hg[i];
+    }
+    if (sum != expected) {
+        printf("FAILED (%d)\n", sum);
+        return 1;
+    }
+
+    printf("PASSED\n");
+    return 0;
+}
diff --git a/test cases/cuda/17 separate compilation linking/meson.build b/test cases/cuda/17 separate compilation linking/meson.build
new file mode 100644
index 0000000..ee86123
--- /dev/null
+++ b/test cases/cuda/17 separate compilation linking/meson.build
@@ -0,0 +1,34 @@
+# example here is inspired by Nvidia's blog post:
+#   https://developer.nvidia.com/blog/separate-compilation-linking-cuda-device-code/
+# code:
+#   https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#examples
+
+project('device linking', ['cpp', 'cuda'], version : '1.0.0')
+
+cuda = import('unstable-cuda')
+nvcc = meson.get_compiler('cuda')
+nvcc_version = nvcc.version()
+
+# architecture flags for nvcc, passed to both the compile and the link step below
+arch_flags = cuda.nvcc_arch_flags(nvcc_version, 'Auto', detected : ['8.0'])
+
+message('NVCC version:   ' + nvcc_version)
+message('NVCC flags:     ' + ' '.join(arch_flags))
+
+# test device linking: both targets are compiled with -dc (--device-c), which
+# produces relocatable device code (`--relocatable-device-code=true --compile`)
+rdc_args = ['-dc'] + arch_flags
+
+lib = static_library(
+  'devicefuncs',
+  ['b.cu'],
+  cuda_args : rdc_args
+)
+exe = executable(
+  'app',
+  'main.cu',
+  cuda_args : rdc_args,
+  link_with : lib,
+  link_args : arch_flags
+)
+test('cudatest', exe)