NVIDIA cuTile Python 教程：在 Colab 中构建用于向量加法、矩阵加法和矩阵乘法的 Tiled GPU 内核

2026-06-09 16:37·23天前·Sana Hassan

精选理由

NVIDIA cuTile把GPU tiled kernel编程的门槛拉低到Python，这个Colab教程从环境搭建到矩阵乘法全链路，想自己写算子的人可以跟着跑一遍。

AI 摘要

该教程基于 NVIDIA cuTile Python 实现了分块 GPU 内核编程工作流，在 Colab 环境中配置 GPU、驱动、CUDA 及 cuTile 可用性后，分别构建了 tiled 向量加法、矩阵加法和矩阵乘法核函数，并以 PyTorch 作为回退保持 notebook 可执行。每一步均通过 PyTorch 验证结果正确性，并基准测试了各阶段的中位运行时间。

AI 翻译 · 中文

在本教程中，我们将实现一个针对 NVIDIA cuTile Python 的高级实践工作流。cuTile Python 是一种基于 tile（分块）的 GPU 编程接口，可直接用 Python 编写高效的 CUDA 风格内核。我们首先准备一个适用于 Colab 的环境，在运行任何内核代码之前检查可用的 GPU、驱动、CUDA 及 cuTile 安装情况。接着，我们构建基于 tile 的向量加法、矩阵加法和矩阵乘法示例，并保留 PyTorch 作为后备方案。这样，即使 Colab 不满足 cuTile 的最新运行时要求，该 notebook 依然可以执行。通过这种方法，我们将理解分块编程的工作原理，以及如何加载、计算、存储和验证张量，并了解如何将自定义 GPU 内核与标准 PyTorch 操作进行比较。

在 Colab 中设置 NVIDIA cuTile Python 并检查 GPU、CUDA 及驱动运行时

复制代码已复制请使用其他浏览器

import os
import sys
import math
import time
import json
import shutil
import subprocess
import textwrap
import warnings
# Ignore every Python warning from here on (presumably to keep notebook output short).
warnings.filterwarnings("ignore")
def run_cmd(cmd, check=False, capture=True):
    """Echo a shell command, run it through the shell, and return the result.

    Args:
        cmd: Command line passed to the shell as a single string.
        check: When true, a non-zero exit status raises ``RuntimeError``.
        capture: When true, stdout and stderr are captured and then printed
            (stdout first) with surrounding whitespace stripped.

    Returns:
        The ``subprocess.CompletedProcess`` for the command.

    Raises:
        RuntimeError: If ``check`` is true and the command exits non-zero.
    """
    print(f"\n$ {cmd}")
    completed = subprocess.run(cmd, shell=True, text=True, capture_output=capture)
    if capture:
        # Blank streams are skipped so quiet commands add no empty lines.
        for stream_text in (completed.stdout, completed.stderr):
            trimmed = stream_text.strip()
            if trimmed:
                print(trimmed)
    failed = completed.returncode != 0
    if check and failed:
        raise RuntimeError(f"Command failed: {cmd}")
    return completed
# Announce the tutorial, install dependencies, then print basic runtime diagnostics.
print("=" * 90)
print("cuTile Python Advanced Colab Tutorial")
print("=" * 90)
print("\n[1] Installing Python dependencies")
# check=False throughout: a failed command is shown through its output, not raised.
run_cmd(f"{sys.executable} -m pip install -q -U pip setuptools wheel", check=False)
run_cmd(f"{sys.executable} -m pip install -q -U torch numpy pandas matplotlib", check=False)
print("\n[2] Trying to install cuTile Python")
print("Package name on PyPI: cuda-tile[tileiras]")
install_result = run_cmd(
    f'{sys.executable} -m pip install -q -U "cuda-tile[tileiras]"',
    check=False
)
print("\n[3] Runtime and GPU diagnostics")
# Query the interpreter that is running this script (the one pip installed into
# above) rather than whichever `python` happens to be first on PATH.
run_cmd(f"{sys.executable} --version", check=False)
run_cmd("nvidia-smi", check=False)
# Import the numerical stack up front; the sections below all depend on it.
try:
    import torch
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
except Exception as e:
    # Chain explicitly so the original import failure is recorded as the cause.
    raise RuntimeError(f"Core dependency import failed: {e}") from e
# Record whether PyTorch can see a CUDA device; later code branches on this flag.
cuda_available = torch.cuda.is_available()
print(f"\nPyTorch CUDA available: {cuda_available}")
if cuda_available:
    # Describe device 0 only.
    device_name = torch.cuda.get_device_name(0)
    capability = torch.cuda.get_device_capability(0)
    print(f"GPU: {device_name}")
    print(f"Compute capability: sm_{capability[0]}{capability[1]}")
else:
    print("No CUDA GPU detected. Colab: Runtime -> Change runtime type -> GPU")
def parse_driver_major():
    """Read the NVIDIA driver version reported by ``nvidia-smi``.

    Returns:
        ``(major, version)`` where ``version`` is the first line printed by the
        query and ``major`` is the integer before its first dot, or
        ``(None, None)`` if the query or the parsing fails for any reason.
    """
    query = "nvidia-smi --query-gpu=driver_version --format=csv,noheader"
    try:
        raw_output = subprocess.check_output(query, shell=True, text=True)
        # Only the first reported line is used.
        version_text = raw_output.strip().splitlines()[0]
        major_version = int(version_text.split(".")[0])
    except Exception:
        # Best-effort probe: a missing tool or unparsable output means "unknown".
        return None, None
    return major_version, version_text
# Query the driver version once; both values are None when the query fails.
driver_major, driver_full = parse_driver_major()
print(f"NVIDIA driver version: {driver_full}")
# `ct` stays None and the flag stays False unless the import below succeeds.
ct = None
cutile_import_ok = False
try:
   import cuda.tile as ct
   cutile_import_ok = True
   print("cuda.tile import: OK")
except Exception as e:
   # Any import-time failure is printed and tolerated so the script can continue.
   print("cuda.tile import: FAILED")
   print(str(e))
# The cuTile path is enabled only when PyTorch sees a CUDA device, cuda.tile
# imported, and the driver major version is known and at least 580.
likely_runtime_ok = (
   cuda_available
   and cutile_import_ok
   and driver_major is not None
   and driver_major >= 580
)
if likely_runtime_ok:
   print("\ncuTile path is enabled.")
else:
   print("\ncuTile path is not enabled in this runtime.")
   print("The tutorial will still run using a PyTorch fallback.")
   print("For real cuTile execution, use a runtime with NVIDIA Driver R580+ and CUDA Toolkit 13.1+.")
# Device string used when the example tensors are created.
DEVICE = "cuda" if cuda_available else "cpu"

我们准备 Colab 环境，安装所需的 Python 包并尝试安装 cuTile Python。随后检查可用的运行时环境，包括 Python、GPU、CUDA 以及 NVIDIA 驱动的可用性。我们还将决定该 notebook 是使用真正的 cuTile 后端，还是继续使用 PyTorch 后备方案。

为 cuTile 内核构建计时、正确性及基准报告工具

复制代码已复制请使用其他浏览器

# Section banner for the timing, checking, and reporting helpers.
print("\n" + "=" * 90)
print("[4] Utilities: timing, correctness checks, and compact reporting")
print("=" * 90)
def sync():
    """Call ``torch.cuda.synchronize()`` when CUDA is available; otherwise do nothing."""
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()
def benchmark(fn, warmup=5, repeat=20, label="function"):
    """Time ``fn`` and summarize its wall-clock latency in milliseconds.

    Args:
        fn: Zero-argument callable to measure.
        warmup: Number of untimed calls made before measuring.
        repeat: Number of timed calls; must be at least 1.
        label: Name stored in the returned row.

    Returns:
        A dict with the keys ``label``, ``mean_ms``, ``median_ms``, ``min_ms``
        and ``max_ms``.

    Raises:
        ValueError: If ``repeat`` is less than 1.
    """
    if repeat < 1:
        # With no samples the statistics are undefined; fail before any work runs.
        raise ValueError("repeat must be at least 1")
    for _ in range(warmup):
        fn()
    sync()
    times = []
    for _ in range(repeat):
        start = time.perf_counter()
        fn()
        # Synchronize inside the timed region so the sample includes the wait.
        sync()
        end = time.perf_counter()
        times.append((end - start) * 1000)
    return {
        "label": label,
        "mean_ms": float(np.mean(times)),
        "median_ms": float(np.median(times)),
        "min_ms": float(np.min(times)),
        "max_ms": float(np.max(times)),
    }
def show_result_table(rows, title):
    """Print ``rows`` as an index-free table under ``title`` and return the DataFrame.

    Args:
        rows: Row data accepted by ``pandas.DataFrame``.
        title: Heading printed after a blank line.

    Returns:
        The ``pandas.DataFrame`` built from ``rows``.
    """
    table = pd.DataFrame(rows)
    heading = "\n" + title
    print(heading)
    rendered = table.to_string(index=False)
    print(rendered)
    return table
def assert_close(name, actual, expected, atol=1e-4, rtol=1e-4):
    """Check ``actual`` against ``expected`` and print a pass message.

    Args:
        name: Label used in the printed message.
        actual: Tensor produced by the code under test.
        expected: Reference tensor.
        atol: Absolute tolerance forwarded to ``torch.testing.assert_close``.
        rtol: Relative tolerance forwarded to ``torch.testing.assert_close``.

    Raises:
        AssertionError: Propagated from ``torch.testing.assert_close`` on mismatch.
    """
    tolerances = {"atol": atol, "rtol": rtol}
    torch.testing.assert_close(actual, expected, **tolerances)
    print(f"{name}: correctness check passed")

我们定义辅助函数，使教程更易于运行、测试和基准测试。我们同步 GPU 执行，在多次重复中测量运行时间，并将基准测试结果整理为可读的表格。我们还添加了一个正确性检查函数，用于将每个自定义操作与预期的 PyTorch 输出进行比较。

定义用于向量加法、矩阵加法和矩阵乘法的分块 cuTile 内核

复制代码已复制请使用其他浏览器

# Section banner, then the cuTile kernels. They are defined only when cuda.tile
# imported, because the decorators and annotations below all reference `ct`.
print("\n" + "=" * 90)
print("[5] cuTile kernels are defined only if cuda.tile imports successfully")
print("=" * 90)
if cutile_import_ok:
   # Annotation for the tile-size parameters (presumably compile-time constants
   # in cuTile -- confirm against the cuda.tile documentation).
   ConstInt = ct.Constant[int]
   # Vector add using whole-tile load/store: each program handles tile `bid`.
   @ct.kernel
   def cutile_vec_add_direct_kernel(a, b, c, TILE: ConstInt):
       bid = ct.bid(0)  # presumably this program's block index on grid axis 0
       a_tile = ct.load(a, index=(bid,), shape=(TILE,))
       b_tile = ct.load(b, index=(bid,), shape=(TILE,))
       c_tile = a_tile + b_tile
       ct.store(c, index=(bid,), tile=c_tile)
   # Vector add using explicit element offsets with gather/scatter.
   @ct.kernel
   def cutile_vec_add_gather_kernel(a, b, c, TILE: ConstInt):
       bid = ct.bid(0)
       # Element offsets covered by this tile: bid*TILE .. bid*TILE + TILE - 1.
       offsets = bid * TILE + ct.arange(TILE, dtype=torch.int32)
       # NOTE(review): offsets in the last tile can exceed the array length;
       # this relies on gather/scatter handling that -- confirm in cuTile docs.
       a_tile = ct.gather(a, offsets)
       b_tile = ct.gather(b, offsets)
       c_tile = a_tile + b_tile
       ct.scatter(c, offsets, c_tile)
   # 2-D add: one (TILE_M, TILE_N) tile per (bid_m, bid_n) grid position.
   @ct.kernel
   def cutile_matrix_add_gather_kernel(a, b, c, TILE_M: ConstInt, TILE_N: ConstInt):
       bid_m = ct.bid(0)
       bid_n = ct.bid(1)
       rows = bid_m * TILE_M + ct.arange(TILE_M, dtype=torch.int32)
       cols = bid_n * TILE_N + ct.arange(TILE_N, dtype=torch.int32)
       # Reshape to a column and a row so the index pair spans the whole tile.
       rows = rows[:, None]
       cols = cols[None, :]
       a_tile = ct.gather(a, (rows, cols))
       b_tile = ct.gather(b, (rows, cols))
       c_tile = a_tile + b_tile
       ct.scatter(c, (rows, cols), c_tile)
   # Tiled matmul: each program accumulates one (TM, TN) tile of C over the K tiles.
   @ct.kernel
   def cutile_matmul_kernel(A, B, C, TM: ConstInt, TN: ConstInt, TK: ConstInt):
       bid_m = ct.bid(0)
       bid_n = ct.bid(1)
       # Number of (TM, TK) tiles along axis 1 of A, i.e. loop steps over K.
       num_tiles_k = ct.num_tiles(A, axis=1, shape=(TM, TK))
       # The accumulator is float32 regardless of the input dtype.
       acc = ct.full((TM, TN), 0, dtype=ct.float32)
       zero_pad = ct.PaddingMode.ZERO  # presumably zero-fills out-of-range elements -- confirm
       # float32 inputs are cast to tfloat32 for the multiply; other dtypes are kept.
       compute_dtype = ct.tfloat32 if A.dtype == ct.float32 else A.dtype
       for k in range(num_tiles_k):
           a_tile = ct.load(
               A,
               index=(bid_m, k),
               shape=(TM, TK),
               padding_mode=zero_pad
           ).astype(compute_dtype)
           b_tile = ct.load(
               B,
               index=(k, bid_n),
               shape=(TK, TN),
               padding_mode=zero_pad
           ).astype(compute_dtype)
           acc = ct.mma(a_tile, b_tile, acc)
       # Convert the accumulator to the output dtype before storing the tile.
       out = ct.astype(acc, C.dtype)
       ct.store(C, index=(bid_m, bid_n), tile=out)
else:
   print("Skipping cuTile kernel definitions because cuda.tile is unavailable.")
# Section banner for the Python wrappers around the kernels.
print("\n" + "=" * 90)
print("[6] High-level wrappers")
print("=" * 90)
def vec_add_tutorial(a, b, use_gather=True):
    """Add two same-shaped tensors, using a cuTile kernel when the runtime allows.

    Args:
        a: First input tensor.
        b: Second input tensor; must have the same shape as ``a``.
        use_gather: When true, launch the gather/scatter kernel with a tile of
            256; otherwise launch the direct load/store kernel with a tile of
            the next power of two of ``a.numel()``, capped at 1024.

    Returns:
        A tensor holding ``a + b``.

    Raises:
        ValueError: If the shapes differ.
    """
    if a.shape != b.shape:
        raise ValueError("a and b must have the same shape")
    # Empty inputs take the fallback: log2(0) below would raise, and there is
    # no tile to launch.
    if likely_runtime_ok and a.is_cuda and a.numel() > 0:
        c = torch.empty_like(a)
        TILE = 256 if use_gather else min(1024, 2 ** math.ceil(math.log2(a.numel())))
        # One program per tile; the last tile may be partial.
        grid = (math.ceil(a.numel() / TILE), 1, 1)
        kernel = cutile_vec_add_gather_kernel if use_gather else cutile_vec_add_direct_kernel
        ct.launch(torch.cuda.current_stream(), grid, kernel, (a, b, c, TILE))
        return c
    return a + b
def matrix_add_tutorial(a, b):
    """Add two same-shaped matrices, using a cuTile kernel when the runtime allows.

    Args:
        a: First input tensor; its first two dimensions size the launch grid.
        b: Second input tensor; must have the same shape as ``a``.

    Returns:
        A tensor holding ``a + b``.

    Raises:
        ValueError: If the shapes differ.
    """
    if a.shape != b.shape:
        raise ValueError("a and b must have the same shape")
    if likely_runtime_ok and a.is_cuda:
        c = torch.empty_like(a)
        TILE_M = 16
        TILE_N = 64
        # One program per (TILE_M, TILE_N) tile; edge tiles may be partial.
        grid = (math.ceil(a.shape[0] / TILE_M), math.ceil(a.shape[1] / TILE_N), 1)
        ct.launch(
            torch.cuda.current_stream(),
            grid,
            cutile_matrix_add_gather_kernel,
            (a, b, c, TILE_M, TILE_N)
        )
        return c
    return a + b
def matmul_tutorial(A, B):
    """Multiply ``A`` by ``B``, using the cuTile matmul kernel when the runtime allows.

    Args:
        A: Left operand; ``A.shape[1]`` must equal ``B.shape[0]``.
        B: Right operand.

    Returns:
        The product, with ``A``'s dtype and device on the cuTile path, or
        ``A @ B`` on the fallback path.

    Raises:
        ValueError: If the inner dimensions do not match.
    """
    if A.shape[1] != B.shape[0]:
        raise ValueError("A.shape[1] must equal B.shape[0]")
    if not (likely_runtime_ok and A.is_cuda):
        return A @ B
    # float16/bfloat16 inputs get larger tiles than every other dtype.
    reduced_precision = A.dtype in (torch.float16, torch.bfloat16)
    TM, TN, TK = (128, 128, 64) if reduced_precision else (32, 32, 32)
    out_rows = A.shape[0]
    out_cols = B.shape[1]
    C = torch.empty((out_rows, out_cols), device=A.device, dtype=A.dtype)
    # One program per (TM, TN) output tile.
    grid = (math.ceil(out_rows / TM), math.ceil(out_cols / TN), 1)
    kernel_args = (A, B, C, TM, TN, TK)
    ct.launch(torch.cuda.current_stream(), grid, cutile_matmul_kernel, kernel_args)
    return C
# Report that the wrappers exist and which backend they will dispatch to.
print("Wrappers ready.")
print(f"Execution backend: {'cuTile' if likely_runtime_ok else 'PyTorch fallback'}")

当 `cuda.tile` 可用时，我们定义用于向量加法、矩阵加法和矩阵乘法的核心 cuTile 内核。这些内核通过分块加载（tiled load）、存储（store）、收集（gather）、散布（scatter）以及矩阵乘法操作，展示了 cuTile 中 GPU 计算的结构。接着，我们将这些内核封装在 Python 函数中；若当前运行时环境不支持 cuTile，这些函数会自动回退到 PyTorch。

运行分块示例，并对照 PyTorch 验证 float32 和 float16 矩阵乘法

复制代码已复制请使用其他浏览器

# Example 1: add two random float32 vectors through the tutorial wrapper and
# compare the result with PyTorch's own `a + b`.
print("\n" + "=" * 90)
print("[7] Example 1: tiled vector addition")
print("=" * 90)
torch.manual_seed(42)  # seed PyTorch's RNG before any inputs are generated
N = 1_000_003  # not a multiple of 256, the tile size the wrapper uses with use_gather=True
a = torch.randn(N, device=DEVICE, dtype=torch.float32)
b = torch.randn(N, device=DEVICE, dtype=torch.float32)
c = vec_add_tutorial(a, b, use_gather=True)
expected = a + b
assert_close("Vector addition", c, expected)
print(f"Input shape: {tuple(a.shape)}")
print(f"Output shape: {tuple(c.shape)}")
print(f"First five output values: {c[:5].detach().cpu().numpy()}")
# Example 2: add two random float32 matrices and compare with PyTorch's `A + B`.
print("\n" + "=" * 90)
print("[8] Example 2: tiled matrix addition with boundary-safe gather/scatter")
print("=" * 90)
# Rebinds N. Neither dimension is a multiple of the wrapper's 16 x 64 tile.
M, N = 777, 1001
A = torch.randn(M, N, device=DEVICE, dtype=torch.float32)
B = torch.randn(M, N, device=DEVICE, dtype=torch.float32)
C = matrix_add_tutorial(A, B)
expected = A + B
assert_close("Matrix addition", C, expected)
print(f"A shape: {tuple(A.shape)}")
print(f"B shape: {tuple(B.shape)}")
print(f"C shape: {tuple(C.shape)}")
# Example 3: float32 matrix multiplication checked against PyTorch's `A32 @ B32`.
print("\n" + "=" * 90)
print("[9] Example 3: tiled matrix multiplication")
print("=" * 90)
M, K, N = 512, 768, 384  # rebinds M and N from the previous example
A32 = torch.randn(M, K, device=DEVICE, dtype=torch.float32)
B32 = torch.randn(K, N, device=DEVICE, dtype=torch.float32)
if DEVICE == "cuda":
   # NOTE(review): presumably lets the PyTorch reference use reduced-precision
   # float32 matmul, comparable to the kernel's tfloat32 path -- confirm.
   torch.set_float32_matmul_precision("high")
C32 = matmul_tutorial(A32, B32)
expected32 = A32 @ B32
# The comparison tolerance is looser on CUDA than on CPU.
if DEVICE == "cuda":
   atol, rtol = 1e-2, 1e-2
else:
   atol, rtol = 1e-4, 1e-4
assert_close("Float32 matmul", C32, expected32, atol=atol, rtol=rtol)
print(f"A32 shape: {tuple(A32.shape)}")
print(f"B32 shape: {tuple(B32.shape)}")
print(f"C32 shape: {tuple(C32.shape)}")
# Example 4: float16 matrix multiplication, run only when the device is CUDA.
print("\n" + "=" * 90)
print("[10] Example 4: half precision matmul")
print("=" * 90)
if DEVICE == "cuda":
   # Reuses M, K, N from the float32 example.
   A16 = torch.randn(M, K, device=DEVICE, dtype=torch.float16)
   B16 = torch.randn(K, N, device=DEVICE, dtype=torch.float16)
   C16 = matmul_tutorial(A16, B16)
   expected16 = A16 @ B16
   # Compared with a looser tolerance than the float32 example.
   assert_close("Float16 matmul", C16, expected16, atol=5e-2, rtol=5e-2)
   print(f"A16 shape: {tuple(A16.shape)}")
   print(f"B16 shape: {tuple(B16.shape)}")
   print(f"C16 shape: {tuple(C16.shape)}")
else:
   print("Skipping float16 GPU matmul because CUDA is unavailable.")

我们实际运行了分块向量加法、分块矩阵加法、float32 矩阵乘法和 float16 矩阵乘法的示例。生成随机张量，执行教程中的函数，并将结果与标准 PyTorch 操作进行比较。同时打印张量形状和示例输出，以确认每个阶段的表现符合预期。

对 cuTile 操作进行基准测试，与 PyTorch 对比中位运行时间并进行可视化

复制代码已复制请使用其他浏览器

print("\n" + "=" * 90)
print("[11] Benchmarks")
print("=" * 90)
# Label prefix naming the backend the tutorial wrappers dispatch to.
_backend_name = 'cuTile' if likely_runtime_ok else 'PyTorch'
# Each tutorial wrapper is followed by its plain PyTorch counterpart, in run order.
_benchmark_cases = [
    (lambda: vec_add_tutorial(a, b, use_gather=True), f"{_backend_name} vector add"),
    (lambda: a + b, "PyTorch vector add"),
    (lambda: matrix_add_tutorial(A, B), f"{_backend_name} matrix add"),
    (lambda: A + B, "PyTorch matrix add"),
    (lambda: matmul_tutorial(A32, B32), f"{_backend_name} fp32 matmul"),
    (lambda: A32 @ B32, "PyTorch fp32 matmul"),
]
bench_rows = []
for _case_fn, _case_label in _benchmark_cases:
    bench_rows.append(benchmark(_case_fn, label=_case_label))
bench_df = show_result_table(bench_rows, "Benchmark summary in milliseconds")
# Plot the median latency of every benchmark row as a bar chart.
print("\n" + "=" * 90)
print("[12] Simple benchmark visualization")
print("=" * 90)
try:
   plt.figure(figsize=(10, 5))
   plt.bar(bench_df["label"], bench_df["median_ms"])
   # Rotate the x-axis labels by 35 degrees, right-aligned.
   plt.xticks(rotation=35, ha="right")
   plt.ylabel("Median time in ms")
   plt.title("cuTile tutorial benchmark comparison")
   plt.tight_layout()
   plt.show()
except Exception as e:
   # Plotting is best-effort: any failure is printed and the script continues.
   print(f"Plot skipped: {e}")
# Print a table of follow-up experiments; every row has the same three keys.
print("\n" + "=" * 90)
print("[13] What to change next")
print("=" * 90)
next_steps = [
   {
       "experiment": "Tile size sweep",
       "what_to_change": "Change TILE, TILE_M, TILE_N, TM, TN, and TK",
       "why_it_matters": "Tile shape controls memory access, occupancy, and Tensor Core usage"
   },
   {
       "experiment": "Non-multiple dimensions",
       "what_to_change": "Use dimensions like 1003 x 771",
       "why_it_matters": "Tests padding, gather/scatter, and boundary behavior"
   },
   {
       "experiment": "Precision comparison",
       "what_to_change": "Compare float32, float16, and bfloat16",
       "why_it_matters": "Tensor Core paths are strongest for reduced precision"
   },
   {
       "experiment": "Operation fusion",
       "what_to_change": "Extend vector add to compute c = relu(a + b)",
       "why_it_matters": "Fusion reduces memory traffic and is a common GPU-kernel optimization"
   },
   {
       "experiment": "Attention kernel study",
       "what_to_change": "Study the repo's AttentionFMHA.py sample",
       "why_it_matters": "Attention shows why tiled kernels matter for transformer workloads"
   }
]
# The dict keys become the column headers; the index column is omitted.
next_df = pd.DataFrame(next_steps)
print(next_df.to_string(index=False))
# Closing banner, followed by a reminder of which backend actually ran.
print("\n" + "=" * 90)
print("Tutorial completed.")
print("=" * 90)
if likely_runtime_ok:
   print("Real cuTile kernels were used.")
else:
   print("This runtime used the PyTorch fallback.")
   print("To run real cuTile kernels, use a GPU machine with NVIDIA Driver R580+ and CUDA Toolkit 13.1+.")

我们对教程中的操作进行基准测试，将其与等价 PyTorch 操作的中位运行时间进行比较。然后通过简单的柱状图可视化基准结果，使性能对比更加直观。最后，列出后续可行的实验方向，例如分块大小调优、精度对比、算子融合，以及对高级 cuTile 示例（如 attention）的研究。

结论

总而言之，我们拥有了一套完整的 cuTile Python 工作流，涵盖了环境配置、内核定义、执行、验证和基准测试。我们实现了直接分块操作、基于 gather/scatter 的索引以及分块矩阵乘法，并在每个阶段通过 PyTorch 输出验证了正确性。回退路径使得该教程对 Colab 用户同样实用，而 cuTile 路径则展示了相同的结构如何在兼容的 NVIDIA GPU 环境中运行。这为我们在分块大小、精度格式、融合操作，以及更高级的 GPU 工作负载（如 attention、layer normalization 和自定义深度学习内核）方面进行实验，提供了一个起点。

MarkTechPost（RSS）

精选70导出 Markdown

NVIDIA cuTile Python 教程：在 Colab 中构建用于向量加法、矩阵加法和矩阵乘法的 Tiled GPU 内核

2026-06-09 16:37·23天前·Sana Hassan

阅读原文· marktechpost.com

精选理由

NVIDIA cuTile把GPU tiled kernel编程的门槛拉低到Python，这个Colab教程从环境搭建到矩阵乘法全链路，想自己写算子的人可以跟着跑一遍。

AI 摘要

AI 翻译 · 中文

在 Colab 中设置 NVIDIA cuTile Python 并检查 GPU、CUDA 及驱动运行时

复制代码已复制请使用其他浏览器

import os
import sys
import math
import time
import json
import shutil
import subprocess
import textwrap
import warnings
# Ignore every Python warning from here on (presumably to keep notebook output short).
warnings.filterwarnings("ignore")
def run_cmd(cmd, check=False, capture=True):
    """Echo a shell command, run it through the shell, and return the result.

    Args:
        cmd: Command line passed to the shell as a single string.
        check: When true, a non-zero exit status raises ``RuntimeError``.
        capture: When true, stdout and stderr are captured and then printed
            (stdout first) with surrounding whitespace stripped.

    Returns:
        The ``subprocess.CompletedProcess`` for the command.

    Raises:
        RuntimeError: If ``check`` is true and the command exits non-zero.
    """
    print(f"\n$ {cmd}")
    completed = subprocess.run(cmd, shell=True, text=True, capture_output=capture)
    if capture:
        # Blank streams are skipped so quiet commands add no empty lines.
        for stream_text in (completed.stdout, completed.stderr):
            trimmed = stream_text.strip()
            if trimmed:
                print(trimmed)
    failed = completed.returncode != 0
    if check and failed:
        raise RuntimeError(f"Command failed: {cmd}")
    return completed
# Announce the tutorial, install dependencies, then print basic runtime diagnostics.
print("=" * 90)
print("cuTile Python Advanced Colab Tutorial")
print("=" * 90)
print("\n[1] Installing Python dependencies")
# check=False throughout: a failed command is shown through its output, not raised.
run_cmd(f"{sys.executable} -m pip install -q -U pip setuptools wheel", check=False)
run_cmd(f"{sys.executable} -m pip install -q -U torch numpy pandas matplotlib", check=False)
print("\n[2] Trying to install cuTile Python")
print("Package name on PyPI: cuda-tile[tileiras]")
install_result = run_cmd(
    f'{sys.executable} -m pip install -q -U "cuda-tile[tileiras]"',
    check=False
)
print("\n[3] Runtime and GPU diagnostics")
# Query the interpreter that is running this script (the one pip installed into
# above) rather than whichever `python` happens to be first on PATH.
run_cmd(f"{sys.executable} --version", check=False)
run_cmd("nvidia-smi", check=False)
# Import the numerical stack up front; the sections below all depend on it.
try:
    import torch
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
except Exception as e:
    # Chain explicitly so the original import failure is recorded as the cause.
    raise RuntimeError(f"Core dependency import failed: {e}") from e
# Record whether PyTorch can see a CUDA device; later code branches on this flag.
cuda_available = torch.cuda.is_available()
print(f"\nPyTorch CUDA available: {cuda_available}")
if cuda_available:
    # Describe device 0 only.
    device_name = torch.cuda.get_device_name(0)
    capability = torch.cuda.get_device_capability(0)
    print(f"GPU: {device_name}")
    print(f"Compute capability: sm_{capability[0]}{capability[1]}")
else:
    print("No CUDA GPU detected. Colab: Runtime -> Change runtime type -> GPU")
def parse_driver_major():
    """Read the NVIDIA driver version reported by ``nvidia-smi``.

    Returns:
        ``(major, version)`` where ``version`` is the first line printed by the
        query and ``major`` is the integer before its first dot, or
        ``(None, None)`` if the query or the parsing fails for any reason.
    """
    query = "nvidia-smi --query-gpu=driver_version --format=csv,noheader"
    try:
        raw_output = subprocess.check_output(query, shell=True, text=True)
        # Only the first reported line is used.
        version_text = raw_output.strip().splitlines()[0]
        major_version = int(version_text.split(".")[0])
    except Exception:
        # Best-effort probe: a missing tool or unparsable output means "unknown".
        return None, None
    return major_version, version_text
# Query the driver version once; both values are None when the query fails.
driver_major, driver_full = parse_driver_major()
print(f"NVIDIA driver version: {driver_full}")
# `ct` stays None and the flag stays False unless the import below succeeds.
ct = None
cutile_import_ok = False
try:
   import cuda.tile as ct
   cutile_import_ok = True
   print("cuda.tile import: OK")
except Exception as e:
   # Any import-time failure is printed and tolerated so the script can continue.
   print("cuda.tile import: FAILED")
   print(str(e))
# The cuTile path is enabled only when PyTorch sees a CUDA device, cuda.tile
# imported, and the driver major version is known and at least 580.
likely_runtime_ok = (
   cuda_available
   and cutile_import_ok
   and driver_major is not None
   and driver_major >= 580
)
if likely_runtime_ok:
   print("\ncuTile path is enabled.")
else:
   print("\ncuTile path is not enabled in this runtime.")
   print("The tutorial will still run using a PyTorch fallback.")
   print("For real cuTile execution, use a runtime with NVIDIA Driver R580+ and CUDA Toolkit 13.1+.")
# Device string used when the example tensors are created.
DEVICE = "cuda" if cuda_available else "cpu"

为 cuTile 内核构建计时、正确性及基准报告工具

复制代码已复制请使用其他浏览器

# Section banner for the timing, checking, and reporting helpers.
print("\n" + "=" * 90)
print("[4] Utilities: timing, correctness checks, and compact reporting")
print("=" * 90)
def sync():
    """Call ``torch.cuda.synchronize()`` when CUDA is available; otherwise do nothing."""
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()
def benchmark(fn, warmup=5, repeat=20, label="function"):
    """Time ``fn`` and summarize its wall-clock latency in milliseconds.

    Args:
        fn: Zero-argument callable to measure.
        warmup: Number of untimed calls made before measuring.
        repeat: Number of timed calls; must be at least 1.
        label: Name stored in the returned row.

    Returns:
        A dict with the keys ``label``, ``mean_ms``, ``median_ms``, ``min_ms``
        and ``max_ms``.

    Raises:
        ValueError: If ``repeat`` is less than 1.
    """
    if repeat < 1:
        # With no samples the statistics are undefined; fail before any work runs.
        raise ValueError("repeat must be at least 1")
    for _ in range(warmup):
        fn()
    sync()
    times = []
    for _ in range(repeat):
        start = time.perf_counter()
        fn()
        # Synchronize inside the timed region so the sample includes the wait.
        sync()
        end = time.perf_counter()
        times.append((end - start) * 1000)
    return {
        "label": label,
        "mean_ms": float(np.mean(times)),
        "median_ms": float(np.median(times)),
        "min_ms": float(np.min(times)),
        "max_ms": float(np.max(times)),
    }
def show_result_table(rows, title):
    """Print ``rows`` as an index-free table under ``title`` and return the DataFrame.

    Args:
        rows: Row data accepted by ``pandas.DataFrame``.
        title: Heading printed after a blank line.

    Returns:
        The ``pandas.DataFrame`` built from ``rows``.
    """
    table = pd.DataFrame(rows)
    heading = "\n" + title
    print(heading)
    rendered = table.to_string(index=False)
    print(rendered)
    return table
def assert_close(name, actual, expected, atol=1e-4, rtol=1e-4):
    """Check ``actual`` against ``expected`` and print a pass message.

    Args:
        name: Label used in the printed message.
        actual: Tensor produced by the code under test.
        expected: Reference tensor.
        atol: Absolute tolerance forwarded to ``torch.testing.assert_close``.
        rtol: Relative tolerance forwarded to ``torch.testing.assert_close``.

    Raises:
        AssertionError: Propagated from ``torch.testing.assert_close`` on mismatch.
    """
    tolerances = {"atol": atol, "rtol": rtol}
    torch.testing.assert_close(actual, expected, **tolerances)
    print(f"{name}: correctness check passed")

定义用于向量加法、矩阵加法和矩阵乘法的分块 cuTile 内核

复制代码已复制请使用其他浏览器

# Section banner, then the cuTile kernels. They are defined only when cuda.tile
# imported, because the decorators and annotations below all reference `ct`.
print("\n" + "=" * 90)
print("[5] cuTile kernels are defined only if cuda.tile imports successfully")
print("=" * 90)
if cutile_import_ok:
   # Annotation for the tile-size parameters (presumably compile-time constants
   # in cuTile -- confirm against the cuda.tile documentation).
   ConstInt = ct.Constant[int]
   # Vector add using whole-tile load/store: each program handles tile `bid`.
   @ct.kernel
   def cutile_vec_add_direct_kernel(a, b, c, TILE: ConstInt):
       bid = ct.bid(0)  # presumably this program's block index on grid axis 0
       a_tile = ct.load(a, index=(bid,), shape=(TILE,))
       b_tile = ct.load(b, index=(bid,), shape=(TILE,))
       c_tile = a_tile + b_tile
       ct.store(c, index=(bid,), tile=c_tile)
   # Vector add using explicit element offsets with gather/scatter.
   @ct.kernel
   def cutile_vec_add_gather_kernel(a, b, c, TILE: ConstInt):
       bid = ct.bid(0)
       # Element offsets covered by this tile: bid*TILE .. bid*TILE + TILE - 1.
       offsets = bid * TILE + ct.arange(TILE, dtype=torch.int32)
       # NOTE(review): offsets in the last tile can exceed the array length;
       # this relies on gather/scatter handling that -- confirm in cuTile docs.
       a_tile = ct.gather(a, offsets)
       b_tile = ct.gather(b, offsets)
       c_tile = a_tile + b_tile
       ct.scatter(c, offsets, c_tile)
   # 2-D add: one (TILE_M, TILE_N) tile per (bid_m, bid_n) grid position.
   @ct.kernel
   def cutile_matrix_add_gather_kernel(a, b, c, TILE_M: ConstInt, TILE_N: ConstInt):
       bid_m = ct.bid(0)
       bid_n = ct.bid(1)
       rows = bid_m * TILE_M + ct.arange(TILE_M, dtype=torch.int32)
       cols = bid_n * TILE_N + ct.arange(TILE_N, dtype=torch.int32)
       # Reshape to a column and a row so the index pair spans the whole tile.
       rows = rows[:, None]
       cols = cols[None, :]
       a_tile = ct.gather(a, (rows, cols))
       b_tile = ct.gather(b, (rows, cols))
       c_tile = a_tile + b_tile
       ct.scatter(c, (rows, cols), c_tile)
   # Tiled matmul: each program accumulates one (TM, TN) tile of C over the K tiles.
   @ct.kernel
   def cutile_matmul_kernel(A, B, C, TM: ConstInt, TN: ConstInt, TK: ConstInt):
       bid_m = ct.bid(0)
       bid_n = ct.bid(1)
       # Number of (TM, TK) tiles along axis 1 of A, i.e. loop steps over K.
       num_tiles_k = ct.num_tiles(A, axis=1, shape=(TM, TK))
       # The accumulator is float32 regardless of the input dtype.
       acc = ct.full((TM, TN), 0, dtype=ct.float32)
       zero_pad = ct.PaddingMode.ZERO  # presumably zero-fills out-of-range elements -- confirm
       # float32 inputs are cast to tfloat32 for the multiply; other dtypes are kept.
       compute_dtype = ct.tfloat32 if A.dtype == ct.float32 else A.dtype
       for k in range(num_tiles_k):
           a_tile = ct.load(
               A,
               index=(bid_m, k),
               shape=(TM, TK),
               padding_mode=zero_pad
           ).astype(compute_dtype)
           b_tile = ct.load(
               B,
               index=(k, bid_n),
               shape=(TK, TN),
               padding_mode=zero_pad
           ).astype(compute_dtype)
           acc = ct.mma(a_tile, b_tile, acc)
       # Convert the accumulator to the output dtype before storing the tile.
       out = ct.astype(acc, C.dtype)
       ct.store(C, index=(bid_m, bid_n), tile=out)
else:
   print("Skipping cuTile kernel definitions because cuda.tile is unavailable.")
# Section banner for the Python wrappers around the kernels.
print("\n" + "=" * 90)
print("[6] High-level wrappers")
print("=" * 90)
def vec_add_tutorial(a, b, use_gather=True):
    """Add two same-shaped tensors, using a cuTile kernel when the runtime allows.

    Args:
        a: First input tensor.
        b: Second input tensor; must have the same shape as ``a``.
        use_gather: When true, launch the gather/scatter kernel with a tile of
            256; otherwise launch the direct load/store kernel with a tile of
            the next power of two of ``a.numel()``, capped at 1024.

    Returns:
        A tensor holding ``a + b``.

    Raises:
        ValueError: If the shapes differ.
    """
    if a.shape != b.shape:
        raise ValueError("a and b must have the same shape")
    # Empty inputs take the fallback: log2(0) below would raise, and there is
    # no tile to launch.
    if likely_runtime_ok and a.is_cuda and a.numel() > 0:
        c = torch.empty_like(a)
        TILE = 256 if use_gather else min(1024, 2 ** math.ceil(math.log2(a.numel())))
        # One program per tile; the last tile may be partial.
        grid = (math.ceil(a.numel() / TILE), 1, 1)
        kernel = cutile_vec_add_gather_kernel if use_gather else cutile_vec_add_direct_kernel
        ct.launch(torch.cuda.current_stream(), grid, kernel, (a, b, c, TILE))
        return c
    return a + b
def matrix_add_tutorial(a, b):
    """Add two same-shaped matrices, using a cuTile kernel when the runtime allows.

    Args:
        a: First input tensor; its first two dimensions size the launch grid.
        b: Second input tensor; must have the same shape as ``a``.

    Returns:
        A tensor holding ``a + b``.

    Raises:
        ValueError: If the shapes differ.
    """
    if a.shape != b.shape:
        raise ValueError("a and b must have the same shape")
    if likely_runtime_ok and a.is_cuda:
        c = torch.empty_like(a)
        TILE_M = 16
        TILE_N = 64
        # One program per (TILE_M, TILE_N) tile; edge tiles may be partial.
        grid = (math.ceil(a.shape[0] / TILE_M), math.ceil(a.shape[1] / TILE_N), 1)
        ct.launch(
            torch.cuda.current_stream(),
            grid,
            cutile_matrix_add_gather_kernel,
            (a, b, c, TILE_M, TILE_N)
        )
        return c
    return a + b
def matmul_tutorial(A, B):
    """Multiply ``A`` by ``B``, using the cuTile matmul kernel when the runtime allows.

    Args:
        A: Left operand; ``A.shape[1]`` must equal ``B.shape[0]``.
        B: Right operand.

    Returns:
        The product, with ``A``'s dtype and device on the cuTile path, or
        ``A @ B`` on the fallback path.

    Raises:
        ValueError: If the inner dimensions do not match.
    """
    if A.shape[1] != B.shape[0]:
        raise ValueError("A.shape[1] must equal B.shape[0]")
    if not (likely_runtime_ok and A.is_cuda):
        return A @ B
    # float16/bfloat16 inputs get larger tiles than every other dtype.
    reduced_precision = A.dtype in (torch.float16, torch.bfloat16)
    TM, TN, TK = (128, 128, 64) if reduced_precision else (32, 32, 32)
    out_rows = A.shape[0]
    out_cols = B.shape[1]
    C = torch.empty((out_rows, out_cols), device=A.device, dtype=A.dtype)
    # One program per (TM, TN) output tile.
    grid = (math.ceil(out_rows / TM), math.ceil(out_cols / TN), 1)
    kernel_args = (A, B, C, TM, TN, TK)
    ct.launch(torch.cuda.current_stream(), grid, cutile_matmul_kernel, kernel_args)
    return C
# Report that the wrappers exist and which backend they will dispatch to.
print("Wrappers ready.")
print(f"Execution backend: {'cuTile' if likely_runtime_ok else 'PyTorch fallback'}")

运行分块示例，并对照 PyTorch 验证 float32 和 float16 矩阵乘法

复制代码已复制请使用其他浏览器

# Example 1: add two random float32 vectors through the tutorial wrapper and
# compare the result with PyTorch's own `a + b`.
print("\n" + "=" * 90)
print("[7] Example 1: tiled vector addition")
print("=" * 90)
torch.manual_seed(42)  # seed PyTorch's RNG before any inputs are generated
N = 1_000_003  # not a multiple of 256, the tile size the wrapper uses with use_gather=True
a = torch.randn(N, device=DEVICE, dtype=torch.float32)
b = torch.randn(N, device=DEVICE, dtype=torch.float32)
c = vec_add_tutorial(a, b, use_gather=True)
expected = a + b
assert_close("Vector addition", c, expected)
print(f"Input shape: {tuple(a.shape)}")
print(f"Output shape: {tuple(c.shape)}")
print(f"First five output values: {c[:5].detach().cpu().numpy()}")
# Example 2: add two random float32 matrices and compare with PyTorch's `A + B`.
print("\n" + "=" * 90)
print("[8] Example 2: tiled matrix addition with boundary-safe gather/scatter")
print("=" * 90)
# Rebinds N. Neither dimension is a multiple of the wrapper's 16 x 64 tile.
M, N = 777, 1001
A = torch.randn(M, N, device=DEVICE, dtype=torch.float32)
B = torch.randn(M, N, device=DEVICE, dtype=torch.float32)
C = matrix_add_tutorial(A, B)
expected = A + B
assert_close("Matrix addition", C, expected)
print(f"A shape: {tuple(A.shape)}")
print(f"B shape: {tuple(B.shape)}")
print(f"C shape: {tuple(C.shape)}")
# Example 3: float32 matrix multiplication checked against PyTorch's `A32 @ B32`.
print("\n" + "=" * 90)
print("[9] Example 3: tiled matrix multiplication")
print("=" * 90)
M, K, N = 512, 768, 384  # rebinds M and N from the previous example
A32 = torch.randn(M, K, device=DEVICE, dtype=torch.float32)
B32 = torch.randn(K, N, device=DEVICE, dtype=torch.float32)
if DEVICE == "cuda":
   # NOTE(review): presumably lets the PyTorch reference use reduced-precision
   # float32 matmul, comparable to the kernel's tfloat32 path -- confirm.
   torch.set_float32_matmul_precision("high")
C32 = matmul_tutorial(A32, B32)
expected32 = A32 @ B32
# The comparison tolerance is looser on CUDA than on CPU.
if DEVICE == "cuda":
   atol, rtol = 1e-2, 1e-2
else:
   atol, rtol = 1e-4, 1e-4
assert_close("Float32 matmul", C32, expected32, atol=atol, rtol=rtol)
print(f"A32 shape: {tuple(A32.shape)}")
print(f"B32 shape: {tuple(B32.shape)}")
print(f"C32 shape: {tuple(C32.shape)}")
# Example 4: float16 matrix multiplication, run only when the device is CUDA.
print("\n" + "=" * 90)
print("[10] Example 4: half precision matmul")
print("=" * 90)
if DEVICE == "cuda":
   # Reuses M, K, N from the float32 example.
   A16 = torch.randn(M, K, device=DEVICE, dtype=torch.float16)
   B16 = torch.randn(K, N, device=DEVICE, dtype=torch.float16)
   C16 = matmul_tutorial(A16, B16)
   expected16 = A16 @ B16
   # Compared with a looser tolerance than the float32 example.
   assert_close("Float16 matmul", C16, expected16, atol=5e-2, rtol=5e-2)
   print(f"A16 shape: {tuple(A16.shape)}")
   print(f"B16 shape: {tuple(B16.shape)}")
   print(f"C16 shape: {tuple(C16.shape)}")
else:
   print("Skipping float16 GPU matmul because CUDA is unavailable.")

对 cuTile 操作进行基准测试，与 PyTorch 对比中位运行时间并进行可视化

复制代码已复制请使用其他浏览器

print("\n" + "=" * 90)
print("[11] Benchmarks")
print("=" * 90)
# Label prefix naming the backend the tutorial wrappers dispatch to.
_backend_name = 'cuTile' if likely_runtime_ok else 'PyTorch'
# Each tutorial wrapper is followed by its plain PyTorch counterpart, in run order.
_benchmark_cases = [
    (lambda: vec_add_tutorial(a, b, use_gather=True), f"{_backend_name} vector add"),
    (lambda: a + b, "PyTorch vector add"),
    (lambda: matrix_add_tutorial(A, B), f"{_backend_name} matrix add"),
    (lambda: A + B, "PyTorch matrix add"),
    (lambda: matmul_tutorial(A32, B32), f"{_backend_name} fp32 matmul"),
    (lambda: A32 @ B32, "PyTorch fp32 matmul"),
]
bench_rows = []
for _case_fn, _case_label in _benchmark_cases:
    bench_rows.append(benchmark(_case_fn, label=_case_label))
bench_df = show_result_table(bench_rows, "Benchmark summary in milliseconds")
# Plot the median latency of every benchmark row as a bar chart.
print("\n" + "=" * 90)
print("[12] Simple benchmark visualization")
print("=" * 90)
try:
   plt.figure(figsize=(10, 5))
   plt.bar(bench_df["label"], bench_df["median_ms"])
   # Rotate the x-axis labels by 35 degrees, right-aligned.
   plt.xticks(rotation=35, ha="right")
   plt.ylabel("Median time in ms")
   plt.title("cuTile tutorial benchmark comparison")
   plt.tight_layout()
   plt.show()
except Exception as e:
   # Plotting is best-effort: any failure is printed and the script continues.
   print(f"Plot skipped: {e}")
# Print a table of follow-up experiments; every row has the same three keys.
print("\n" + "=" * 90)
print("[13] What to change next")
print("=" * 90)
next_steps = [
   {
       "experiment": "Tile size sweep",
       "what_to_change": "Change TILE, TILE_M, TILE_N, TM, TN, and TK",
       "why_it_matters": "Tile shape controls memory access, occupancy, and Tensor Core usage"
   },
   {
       "experiment": "Non-multiple dimensions",
       "what_to_change": "Use dimensions like 1003 x 771",
       "why_it_matters": "Tests padding, gather/scatter, and boundary behavior"
   },
   {
       "experiment": "Precision comparison",
       "what_to_change": "Compare float32, float16, and bfloat16",
       "why_it_matters": "Tensor Core paths are strongest for reduced precision"
   },
   {
       "experiment": "Operation fusion",
       "what_to_change": "Extend vector add to compute c = relu(a + b)",
       "why_it_matters": "Fusion reduces memory traffic and is a common GPU-kernel optimization"
   },
   {
       "experiment": "Attention kernel study",
       "what_to_change": "Study the repo's AttentionFMHA.py sample",
       "why_it_matters": "Attention shows why tiled kernels matter for transformer workloads"
   }
]
# The dict keys become the column headers; the index column is omitted.
next_df = pd.DataFrame(next_steps)
print(next_df.to_string(index=False))
# Closing banner, followed by a reminder of which backend actually ran.
print("\n" + "=" * 90)
print("Tutorial completed.")
print("=" * 90)
if likely_runtime_ok:
   print("Real cuTile kernels were used.")
else:
   print("This runtime used the PyTorch fallback.")
   print("To run real cuTile kernels, use a GPU machine with NVIDIA Driver R580+ and CUDA Toolkit 13.1+.")