# NVIDIA cuTile Python 教程：在 Colab 中构建用于向量加法、矩阵加法和矩阵乘法的 Tiled GPU 内核

- 来源：MarkTechPost（RSS）
- 作者：Sana Hassan
- 发布时间：2026-06-09 16:37
- AIHOT 分数：70
- AIHOT 标记：精选
- AIHOT 链接：https://aihot.virxact.com/items/cmq6e6gmb07n9sl5i3gujrjpe
- 原文链接：https://www.marktechpost.com/2026/06/09/nvidia-cutile-python-tutorial-building-tiled-gpu-kernels-for-vector-addition-matrix-addition-and-matrix-multiplication-in-colab

## 精选理由

NVIDIA cuTile把GPU tiled kernel编程的门槛拉低到Python，这个Colab教程从环境搭建到矩阵乘法全链路，想自己写算子的人可以跟着跑一遍。

## AI 摘要

该教程基于 NVIDIA cuTile Python 实现了分块 GPU 内核编程工作流，在 Colab 环境中配置 GPU、驱动、CUDA 及 cuTile 可用性后，分别构建了 tiled 向量加法、矩阵加法和矩阵乘法核函数，并以 PyTorch 作为回退保持 notebook 可执行。每一步均通过 PyTorch 验证结果正确性，并基准测试了各阶段的中位运行时间。

## 正文

在本教程中，我们将实现一个针对 NVIDIA cuTile Python 的高级实践工作流。cuTile Python 是一种基于 tile（分块）的 GPU 编程接口，可直接用 Python 编写高效的 CUDA 风格内核。我们首先准备一个适用于 Colab 的环境，在运行任何内核代码之前检查可用的 GPU、驱动、CUDA 及 cuTile 安装情况。接着，我们构建基于 tile 的向量加法、矩阵加法和矩阵乘法示例，并保留 PyTorch 作为后备方案。这样，即使 Colab 不满足 cuTile 的最新运行时要求，该 notebook 依然可以执行。通过这种方法，我们将理解分块编程的工作原理，以及如何加载、计算、存储和验证张量，并了解如何将自定义 GPU 内核与标准 PyTorch 操作进行比较。

### 在 Colab 中设置 NVIDIA cuTile Python 并检查 GPU、CUDA 及驱动运行时

代码如下：

# Standard-library modules for the tutorial script.
# NOTE(review): os, json, shutil and textwrap are not referenced in the code
# visible in this document — confirm before removing them.
import os
import sys
import math
import time
import json
import shutil
import subprocess
import textwrap
import warnings
# Install a blanket "ignore" filter so notebook output stays compact.
# NOTE(review): this also hides deprecation and runtime warnings.
warnings.filterwarnings("ignore")
def run_cmd(cmd, check=False, capture=True):
    """Run a shell command, echo it together with its output, and return the result.

    Args:
        cmd: Command string; it is executed through the shell.
        check: When true, a non-zero exit status raises ``RuntimeError``.
        capture: When true, stdout/stderr are captured and printed here
            (streams that are blank after stripping are skipped).

    Returns:
        The ``subprocess.CompletedProcess`` produced by ``subprocess.run``.

    Raises:
        RuntimeError: If ``check`` is true and the command exits non-zero.
    """
    print(f"\n$ {cmd}")
    completed = subprocess.run(cmd, shell=True, text=True, capture_output=capture)

    if capture:
        # stdout first, then stderr, each trimmed of surrounding whitespace.
        for stream_text in (completed.stdout, completed.stderr):
            trimmed = stream_text.strip()
            if trimmed:
                print(trimmed)

    failed = completed.returncode != 0
    if failed and check:
        raise RuntimeError(f"Command failed: {cmd}")
    return completed
# Tutorial banner.
print("=" * 90)
print("cuTile Python Advanced Colab Tutorial")
print("=" * 90)

# [1] Upgrade the packaging tools, then the scientific stack used later on.
# check=False: a failing pip call is printed but does not abort the script.
print("\n[1] Installing Python dependencies")
run_cmd(f"{sys.executable} -m pip install -q -U pip setuptools wheel", check=False)
run_cmd(f"{sys.executable} -m pip install -q -U torch numpy pandas matplotlib", check=False)

# [2] Best-effort cuTile install; the result is kept for later inspection.
print("\n[2] Trying to install cuTile Python")
print("Package name on PyPI: cuda-tile[tileiras]")
install_result = run_cmd(
    f'{sys.executable} -m pip install -q -U "cuda-tile[tileiras]"',
    check=False
)

# [3] Report the interpreter and the GPU state.
print("\n[3] Runtime and GPU diagnostics")
# Ask the same interpreter that ran the pip installs above; a bare "python"
# may resolve to a different interpreter or be absent from PATH.
run_cmd(f"{sys.executable} --version", check=False)
run_cmd("nvidia-smi", check=False)

# The rest of the tutorial cannot run without these packages, so fail loudly
# and keep the original exception attached as the cause.
try:
    import torch
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
except Exception as e:
    raise RuntimeError(f"Core dependency import failed: {e}") from e

cuda_available = torch.cuda.is_available()
print(f"\nPyTorch CUDA available: {cuda_available}")
if cuda_available:
    # Describe device 0 only.
    device_name = torch.cuda.get_device_name(0)
    capability = torch.cuda.get_device_capability(0)
    print(f"GPU: {device_name}")
    print(f"Compute capability: sm_{capability[0]}{capability[1]}")
else:
    print("No CUDA GPU detected. Colab: Runtime -> Change runtime type -> GPU")
def parse_driver_major():
    """Return the NVIDIA driver version reported by ``nvidia-smi``.

    Returns:
        tuple: ``(major, full)`` where ``full`` is the first line printed by
        ``nvidia-smi --query-gpu=driver_version --format=csv,noheader`` and
        ``major`` is the integer before its first ``"."``. ``(None, None)``
        is returned when the command cannot be started, exits with an error,
        prints nothing, or prints a version whose leading part is not an
        integer.
    """
    query = ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"]
    try:
        # Argument list rather than a shell string: no shell is spawned and a
        # missing binary surfaces as OSError instead of a shell error message.
        first_line = subprocess.check_output(query, text=True).strip().splitlines()[0]
        return int(first_line.split(".")[0]), first_line
    except (OSError, subprocess.SubprocessError, IndexError, ValueError):
        # Best-effort probe: any failure simply means "driver unknown".
        return None, None
# Probe the driver; both values are None when the probe fails.
driver_major, driver_full = parse_driver_major()
print(f"NVIDIA driver version: {driver_full}")

# `ct` stays None when the import below fails.
ct = None
try:
    import cuda.tile as ct
except Exception as import_error:
    cutile_import_ok = False
    print("cuda.tile import: FAILED")
    print(str(import_error))
else:
    cutile_import_ok = True
    print("cuda.tile import: OK")

# The cuTile path needs CUDA, an importable cuda.tile and a driver whose
# major version is at least 580; otherwise the wrappers use PyTorch.
likely_runtime_ok = False
if cuda_available and cutile_import_ok and driver_major is not None:
    likely_runtime_ok = driver_major >= 580

if not likely_runtime_ok:
    print("\ncuTile path is not enabled in this runtime.")
    print("The tutorial will still run using a PyTorch fallback.")
    print("For real cuTile execution, use a runtime with NVIDIA Driver R580+ and CUDA Toolkit 13.1+.")
else:
    print("\ncuTile path is enabled.")

# Device on which every example tensor is created.
DEVICE = "cpu" if not cuda_available else "cuda"

我们准备 Colab 环境，安装所需的 Python 包并尝试安装 cuTile Python。随后检查可用的运行时环境，包括 Python、GPU、CUDA 以及 NVIDIA 驱动的可用性。我们还将决定该 notebook 是使用真正的 cuTile 后端，还是继续使用 PyTorch 后备方案。

### 为 cuTile 内核构建计时、正确性及基准报告工具

代码如下：

# Section banner: blank line, rule, title, rule.
print(
    "\n" + "=" * 90,
    "[4] Utilities: timing, correctness checks, and compact reporting",
    "=" * 90,
    sep="\n",
)
def sync():
    """Call ``torch.cuda.synchronize()`` when CUDA is available; otherwise do nothing."""
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()
def benchmark(fn, warmup=5, repeat=20, label="function"):
    """Time ``fn`` over several runs and summarise the samples in milliseconds.

    Args:
        fn: Zero-argument callable to time; its return value is discarded.
        warmup: Number of untimed calls made before measuring.
        repeat: Number of timed calls; must be at least 1.
        label: Name stored in the returned row.

    Returns:
        dict with keys ``label``, ``mean_ms``, ``median_ms``, ``min_ms`` and
        ``max_ms``.

    Raises:
        ValueError: If ``repeat`` is smaller than 1 (there would be no
            samples to summarise).
    """
    if repeat < 1:
        raise ValueError("repeat must be at least 1")

    for _ in range(warmup):
        fn()
    # Drain the warmup work so it is not charged to the first timed sample.
    sync()

    times = []
    for _ in range(repeat):
        start = time.perf_counter()
        fn()
        # Each sample is taken after sync(), so the GPU work launched by fn()
        # is included in the measured interval.
        sync()
        times.append((time.perf_counter() - start) * 1000)

    return {
        "label": label,
        "mean_ms": float(np.mean(times)),
        "median_ms": float(np.median(times)),
        "min_ms": float(np.min(times)),
        "max_ms": float(np.max(times)),
    }
def show_result_table(rows, title):
    """Print ``rows`` as an index-free table under ``title`` and return the DataFrame.

    Args:
        rows: Data accepted by ``pd.DataFrame`` (the tutorial passes a list of dicts).
        title: Heading printed on its own line after a blank line.

    Returns:
        The ``pandas.DataFrame`` built from ``rows``.
    """
    table = pd.DataFrame(rows)
    print(f"\n{title}")
    rendered = table.to_string(index=False)
    print(rendered)
    return table
def assert_close(name, actual, expected, atol=1e-4, rtol=1e-4):
    """Check ``actual`` against ``expected`` and report success.

    Args:
        name: Label used in the success message.
        actual: Tensor produced by the code under test.
        expected: Reference tensor.
        atol: Absolute tolerance forwarded to ``torch.testing.assert_close``.
        rtol: Relative tolerance forwarded to ``torch.testing.assert_close``.

    Raises:
        AssertionError: Raised by ``torch.testing.assert_close`` on mismatch.
    """
    tolerances = {"rtol": rtol, "atol": atol}
    torch.testing.assert_close(actual, expected, **tolerances)
    message = f"{name}: correctness check passed"
    print(message)

我们定义辅助函数，使教程更易于运行、测试和基准测试。我们同步 GPU 执行，在多次重复中测量运行时间，并将基准测试结果整理为可读的表格。我们还添加了一个正确性检查函数，用于将每个自定义操作与预期的 PyTorch 输出进行比较。

### 定义用于向量加法、矩阵加法和矩阵乘法的分块 cuTile 内核

代码如下：

print(
    "\n" + "=" * 90,
    "[5] cuTile kernels are defined only if cuda.tile imports successfully",
    "=" * 90,
    sep="\n",
)

if not cutile_import_ok:
    print("Skipping cuTile kernel definitions because cuda.tile is unavailable.")
else:
    # Annotation used for the tile-size kernel parameters.
    ConstInt = ct.Constant[int]

    # Vector add, tile-indexed: each block loads the TILE-wide tile of `a`
    # and `b` selected by its index on grid axis 0, adds them and stores the
    # result at the same tile index of `c`.
    @ct.kernel
    def cutile_vec_add_direct_kernel(a, b, c, TILE: ConstInt):
        block_idx = ct.bid(0)
        lhs = ct.load(a, index=(block_idx,), shape=(TILE,))
        rhs = ct.load(b, index=(block_idx,), shape=(TILE,))
        total = lhs + rhs
        ct.store(c, index=(block_idx,), tile=total)

    # Vector add, element-indexed: each block builds the element offsets
    # block * TILE + [0, TILE) and gathers/scatters through them.
    # NOTE(review): offsets of the last block can exceed the vector length;
    # this relies on cuTile's gather/scatter treatment of out-of-range
    # indices — confirm against the cuTile documentation.
    @ct.kernel
    def cutile_vec_add_gather_kernel(a, b, c, TILE: ConstInt):
        block_idx = ct.bid(0)
        element_ids = block_idx * TILE + ct.arange(TILE, dtype=torch.int32)
        lhs = ct.gather(a, element_ids)
        rhs = ct.gather(b, element_ids)
        total = lhs + rhs
        ct.scatter(c, element_ids, total)

    # Matrix add: same gather/scatter scheme in two dimensions. Row ids are
    # reshaped to a column and column ids to a row so the (rows, cols) pair
    # addresses a TILE_M x TILE_N patch.
    @ct.kernel
    def cutile_matrix_add_gather_kernel(a, b, c, TILE_M: ConstInt, TILE_N: ConstInt):
        block_row = ct.bid(0)
        block_col = ct.bid(1)
        row_ids = block_row * TILE_M + ct.arange(TILE_M, dtype=torch.int32)
        col_ids = block_col * TILE_N + ct.arange(TILE_N, dtype=torch.int32)
        row_ids = row_ids[:, None]
        col_ids = col_ids[None, :]
        lhs = ct.gather(a, (row_ids, col_ids))
        rhs = ct.gather(b, (row_ids, col_ids))
        total = lhs + rhs
        ct.scatter(c, (row_ids, col_ids), total)

    # Matmul: each block owns one TM x TN output tile and walks the K axis
    # in TK-wide steps, accumulating in float32. float32 inputs are cast to
    # ct.tfloat32 before the mma; other dtypes are used unchanged. Loads use
    # zero padding and the accumulator is cast to C's dtype before the store.
    @ct.kernel
    def cutile_matmul_kernel(A, B, C, TM: ConstInt, TN: ConstInt, TK: ConstInt):
        tile_row = ct.bid(0)
        tile_col = ct.bid(1)
        k_tiles = ct.num_tiles(A, axis=1, shape=(TM, TK))
        accumulator = ct.full((TM, TN), 0, dtype=ct.float32)
        pad_with_zeros = ct.PaddingMode.ZERO
        operand_dtype = ct.tfloat32 if A.dtype == ct.float32 else A.dtype
        for k_idx in range(k_tiles):
            lhs = ct.load(
                A,
                index=(tile_row, k_idx),
                shape=(TM, TK),
                padding_mode=pad_with_zeros
            ).astype(operand_dtype)
            rhs = ct.load(
                B,
                index=(k_idx, tile_col),
                shape=(TK, TN),
                padding_mode=pad_with_zeros
            ).astype(operand_dtype)
            accumulator = ct.mma(lhs, rhs, accumulator)
        result_tile = ct.astype(accumulator, C.dtype)
        ct.store(C, index=(tile_row, tile_col), tile=result_tile)

print("\n" + "=" * 90, "[6] High-level wrappers", "=" * 90, sep="\n")
def vec_add_tutorial(a, b, use_gather=True):
    """Add two same-shaped tensors, using a cuTile kernel when the runtime allows.

    Args:
        a: First operand.
        b: Second operand; must have the same shape as ``a``.
        use_gather: When true, launch the gather/scatter kernel with a fixed
            tile of 256 elements; otherwise launch the tile-indexed kernel
            with a tile of ``min(1024, next power of two >= a.numel())``.

    Returns:
        ``a + b``. The cuTile kernel is used only when ``a`` is a non-empty
        CUDA tensor and the cuTile runtime was detected; in every other case
        the sum is computed by PyTorch.

    Raises:
        ValueError: If ``a`` and ``b`` differ in shape.
    """
    if a.shape != b.shape:
        raise ValueError("a and b must have the same shape")

    # Cheap tensor checks come first. Empty tensors skip the kernel path:
    # math.log2(0) is undefined and there would be no tile to launch.
    if not (a.is_cuda and a.numel() > 0 and likely_runtime_ok):
        return a + b

    # NOTE(review): the kernels index a single axis, so this path presumably
    # expects 1-D inputs — confirm before passing higher-rank tensors.
    c = torch.empty_like(a)
    TILE = 256 if use_gather else min(1024, 2 ** math.ceil(math.log2(a.numel())))
    grid = (math.ceil(a.numel() / TILE), 1, 1)
    kernel = cutile_vec_add_gather_kernel if use_gather else cutile_vec_add_direct_kernel
    ct.launch(torch.cuda.current_stream(), grid, kernel, (a, b, c, TILE))
    return c
def matrix_add_tutorial(a, b):
    """Add two same-shaped tensors, using the 2-D cuTile kernel when possible.

    Args:
        a: First operand.
        b: Second operand; must have the same shape as ``a``.

    Returns:
        ``a + b``. The cuTile kernel (16 x 64 tiles) is used only when ``a``
        is a non-empty 2-D CUDA tensor and the cuTile runtime was detected;
        every other input is summed by PyTorch.

    Raises:
        ValueError: If ``a`` and ``b`` differ in shape.
    """
    if a.shape != b.shape:
        raise ValueError("a and b must have the same shape")

    # The kernel addresses exactly two axes and the grid is derived from
    # shape[0] and shape[1], so other ranks and empty inputs go to PyTorch.
    if not (a.is_cuda and a.dim() == 2 and a.numel() > 0 and likely_runtime_ok):
        return a + b

    c = torch.empty_like(a)
    TILE_M = 16
    TILE_N = 64
    # One block per TILE_M x TILE_N patch, rounding up on both axes.
    grid = (math.ceil(a.shape[0] / TILE_M), math.ceil(a.shape[1] / TILE_N), 1)
    ct.launch(
        torch.cuda.current_stream(),
        grid,
        cutile_matrix_add_gather_kernel,
        (a, b, c, TILE_M, TILE_N)
    )
    return c
def matmul_tutorial(A, B):
    """Multiply ``A @ B`` with the cuTile matmul kernel, or with PyTorch as fallback.

    Args:
        A: Left operand; ``A.shape[1]`` is the contraction length.
        B: Right operand; ``B.shape[0]`` must equal ``A.shape[1]``.

    Returns:
        The product. On the cuTile path it is a new tensor of shape
        ``(A.shape[0], B.shape[1])`` with ``A``'s device and dtype.

    Raises:
        ValueError: If the inner dimensions disagree.
    """
    if A.shape[1] != B.shape[0]:
        raise ValueError("A.shape[1] must equal B.shape[0]")

    if not (likely_runtime_ok and A.is_cuda):
        return A @ B

    # float16/bfloat16 inputs get larger tiles than every other dtype.
    half_precision = A.dtype in (torch.float16, torch.bfloat16)
    TM, TN, TK = (128, 128, 64) if half_precision else (32, 32, 32)

    out_rows, out_cols = A.shape[0], B.shape[1]
    C = torch.empty((out_rows, out_cols), device=A.device, dtype=A.dtype)
    # One block per TM x TN output tile, rounding up on both axes.
    grid = (math.ceil(out_rows / TM), math.ceil(out_cols / TN), 1)
    ct.launch(
        torch.cuda.current_stream(),
        grid,
        cutile_matmul_kernel,
        (A, B, C, TM, TN, TK)
    )
    return C
# Report which backend the wrappers will dispatch to.
print("Wrappers ready.")
print("Execution backend: " + ("cuTile" if likely_runtime_ok else "PyTorch fallback"))

当 `cuda.tile` 可用时，我们定义了用于向量加法、矩阵加法和矩阵乘法的核心 cuTile 内核。这些内核通过分块加载（tiled load）、存储（store）、收集（gather）、散射（scatter）以及矩阵乘法操作，展示了 cuTile 中 GPU 计算的结构。接着，我们将这些内核封装在 Python 函数中；如果当前运行时环境不支持 cuTile，这些函数会自动回退到 PyTorch。

### 运行分块示例并针对 PyTorch 验证 float32 和 float16 Matmul

代码如下：

print("\n" + "=" * 90, "[7] Example 1: tiled vector addition", "=" * 90, sep="\n")

# Fixed seed so the random inputs are reproducible.
torch.manual_seed(42)
# 1_000_003 is not a multiple of the 256-element gather tile, so the last
# tile is only partially filled.
N = 1_000_003
a, b = (torch.randn(N, device=DEVICE, dtype=torch.float32) for _ in range(2))

c = vec_add_tutorial(a, b, use_gather=True)
expected = a + b
assert_close("Vector addition", c, expected)

print(
    f"Input shape: {tuple(a.shape)}",
    f"Output shape: {tuple(c.shape)}",
    f"First five output values: {c[:5].detach().cpu().numpy()}",
    sep="\n",
)
print(
    "\n" + "=" * 90,
    "[8] Example 2: tiled matrix addition with boundary-safe gather/scatter",
    "=" * 90,
    sep="\n",
)

# Neither dimension is a multiple of the 16 x 64 tile used by the wrapper,
# so edge tiles are partial. Note that N is rebound here.
M, N = 777, 1001
A, B = (torch.randn(M, N, device=DEVICE, dtype=torch.float32) for _ in range(2))

C = matrix_add_tutorial(A, B)
expected = A + B
assert_close("Matrix addition", C, expected)

print(
    f"A shape: {tuple(A.shape)}",
    f"B shape: {tuple(B.shape)}",
    f"C shape: {tuple(C.shape)}",
    sep="\n",
)
print("\n" + "=" * 90, "[9] Example 3: tiled matrix multiplication", "=" * 90, sep="\n")

M, K, N = 512, 768, 384
A32 = torch.randn(M, K, device=DEVICE, dtype=torch.float32)
B32 = torch.randn(K, N, device=DEVICE, dtype=torch.float32)

# The cuTile kernel casts float32 operands to tfloat32, so on GPU the
# PyTorch reference is switched to its "high" float32 matmul precision.
# NOTE(review): presumably chosen so both sides use TF32-class math — confirm.
if DEVICE == "cuda":
    torch.set_float32_matmul_precision("high")

C32 = matmul_tutorial(A32, B32)
expected32 = A32 @ B32

# Looser tolerances on GPU than on CPU.
atol, rtol = (1e-2, 1e-2) if DEVICE == "cuda" else (1e-4, 1e-4)
assert_close("Float32 matmul", C32, expected32, atol=atol, rtol=rtol)

print(
    f"A32 shape: {tuple(A32.shape)}",
    f"B32 shape: {tuple(B32.shape)}",
    f"C32 shape: {tuple(C32.shape)}",
    sep="\n",
)
print("\n" + "=" * 90, "[10] Example 4: half precision matmul", "=" * 90, sep="\n")

# The float16 example runs only on a CUDA device; M, K, N are reused from
# the float32 example.
if DEVICE != "cuda":
    print("Skipping float16 GPU matmul because CUDA is unavailable.")
else:
    A16 = torch.randn(M, K, device=DEVICE, dtype=torch.float16)
    B16 = torch.randn(K, N, device=DEVICE, dtype=torch.float16)
    C16 = matmul_tutorial(A16, B16)
    expected16 = A16 @ B16
    assert_close("Float16 matmul", C16, expected16, atol=5e-2, rtol=5e-2)
    print(
        f"A16 shape: {tuple(A16.shape)}",
        f"B16 shape: {tuple(B16.shape)}",
        f"C16 shape: {tuple(C16.shape)}",
        sep="\n",
    )

我们实际运行了分块向量加法、分块矩阵加法、float32 矩阵乘法和 float16 矩阵乘法的示例。生成随机张量，执行教程中的函数，并将结果与标准 PyTorch 操作进行比较。同时打印张量形状和示例输出，以确认每个阶段的表现符合预期。

### 对 cuTile 操作进行基准测试，并与 PyTorch 对比中位运行时间并可视化

代码如下：

print("\n" + "=" * 90, "[11] Benchmarks", "=" * 90, sep="\n")

# Each tutorial wrapper is timed next to the plain PyTorch expression it
# mirrors; rows are produced in the order listed. The wrapper rows are
# labelled with the backend that is actually active.
bench_rows = [
    benchmark(timed_fn, label=row_label)
    for timed_fn, row_label in (
        (
            lambda: vec_add_tutorial(a, b, use_gather=True),
            f"{'cuTile' if likely_runtime_ok else 'PyTorch'} vector add",
        ),
        (lambda: a + b, "PyTorch vector add"),
        (
            lambda: matrix_add_tutorial(A, B),
            f"{'cuTile' if likely_runtime_ok else 'PyTorch'} matrix add",
        ),
        (lambda: A + B, "PyTorch matrix add"),
        (
            lambda: matmul_tutorial(A32, B32),
            f"{'cuTile' if likely_runtime_ok else 'PyTorch'} fp32 matmul",
        ),
        (lambda: A32 @ B32, "PyTorch fp32 matmul"),
    )
]

bench_df = show_result_table(bench_rows, "Benchmark summary in milliseconds")
print("\n" + "=" * 90, "[12] Simple benchmark visualization", "=" * 90, sep="\n")

# Plotting is best-effort: any failure is reported and the script continues.
try:
    plt.figure(figsize=(10, 5))
    # One bar per benchmark row, height = median runtime.
    plt.bar(x=bench_df["label"], height=bench_df["median_ms"])
    plt.xticks(rotation=35, ha="right")
    plt.ylabel("Median time in ms")
    plt.title("cuTile tutorial benchmark comparison")
    plt.tight_layout()
    plt.show()
except Exception as plot_error:
    print(f"Plot skipped: {plot_error}")
print("\n" + "=" * 90, "[13] What to change next", "=" * 90, sep="\n")

# Follow-up experiments, one row each: (experiment, what to change, why).
next_steps = [
    dict(zip(("experiment", "what_to_change", "why_it_matters"), row))
    for row in (
        (
            "Tile size sweep",
            "Change TILE, TILE_M, TILE_N, TM, TN, and TK",
            "Tile shape controls memory access, occupancy, and Tensor Core usage",
        ),
        (
            "Non-multiple dimensions",
            "Use dimensions like 1003 x 771",
            "Tests padding, gather/scatter, and boundary behavior",
        ),
        (
            "Precision comparison",
            "Compare float32, float16, and bfloat16",
            "Tensor Core paths are strongest for reduced precision",
        ),
        (
            "Operation fusion",
            "Extend vector add to compute c = relu(a + b)",
            "Fusion reduces memory traffic and is a common GPU-kernel optimization",
        ),
        (
            "Attention kernel study",
            "Study the repo's AttentionFMHA.py sample",
            "Attention shows why tiled kernels matter for transformer workloads",
        ),
    )
]

next_df = pd.DataFrame(next_steps)
print(next_df.to_string(index=False))
print("\n" + "=" * 90, "Tutorial completed.", "=" * 90, sep="\n")

# Final note on which backend actually ran.
if not likely_runtime_ok:
    print("This runtime used the PyTorch fallback.")
    print("To run real cuTile kernels, use a GPU machine with NVIDIA Driver R580+ and CUDA Toolkit 13.1+.")
else:
    print("Real cuTile kernels were used.")

我们对教程中的操作进行基准测试，将其与等价 PyTorch 操作的中位运行时间进行比较。然后通过简单的柱状图可视化基准结果，使性能对比更加直观。最后，列出后续可行的实验方向，例如分块大小调优、精度对比、算子融合，以及对高级 cuTile 示例（如 attention）的研究。

### 结论

总而言之，我们拥有了一套完整的 cuTile Python 工作流，涵盖了环境配置、内核定义、执行、验证和基准测试。我们实现了直接分块操作、基于 gather/scatter 的索引以及分块矩阵乘法，并在每个阶段通过 PyTorch 输出验证了正确性。回退路径使得该教程对 Colab 用户同样实用，而 cuTile 路径则展示了相同的结构如何在兼容的 NVIDIA GPU 环境中运行。这为我们在分块大小、精度格式、融合操作，以及更高级的 GPU 工作负载（如 attention、layer normalization 和自定义深度学习内核）方面进行实验，提供了一个起点。