# DeepSeek-V3.2-NVFP4: inference/nvfp4_kernel.py
"""
NVFP4 kernels for DeepSeek inference on SM120 (RTX Pro 6000 Blackwell).
This module provides NVFP4 equivalents for the FP8 kernels in kernel.py:
- nvfp4_gemm: Block-scaled NVFP4 matrix multiplication
- act_quant_nvfp4: Quantize activations to NVFP4
Weight format:
weight: [N, K/2] packed uint8 (2 FP4 E2M1 per byte)
weight_scale: [N, K/16] FP8 E4M3 per-block scale
weight_scale_2: [1] FP32 global scale
"""
import torch
import triton
import triton.language as tl
from triton.tools.tensor_descriptor import TensorDescriptor
from typing import Tuple, Optional
import functools
# NVFP4 E2M1 lookup table for dequantization
NVFP4_LUT = torch.tensor([
0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, # positive values
-0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0, # negative values
], dtype=torch.float32)
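
# Illustrative nibble decode using the table above: the low nibble of a packed
# byte holds the even-indexed element and the high nibble the odd-indexed one
# (see quantize_act_nvfp4 / dequantize_nvfp4 below). For example,
# byte 0x2B -> low nibble 0xB = -1.5, high nibble 0x2 = 1.0.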
@functools.lru_cache(maxsize=8)
def _get_nvfp4_lut(device_str: str) -> torch.Tensor:
"""Get NVFP4 lookup table on specified device (cached).
Args:
device_str: Device string (e.g., 'cpu', 'cuda:0')
Returns:
NVFP4 lookup table on the specified device
"""
return NVFP4_LUT.to(device=device_str)
# Block size for NVFP4 (16 elements per scale)
NVFP4_BLOCK_SIZE = 16
def get_nvfp4_configs():
"""Get kernel configs appropriate for SM120."""
capability = torch.cuda.get_device_capability()[0]
if capability == 12: # SM120 - Blackwell workstation
return {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"num_stages": 2,
"VEC_SIZE": 16,
}
    else:  # SM100 (Blackwell datacenter) and other compute capabilities
return {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 256,
"num_stages": 4,
"VEC_SIZE": 16,
}
def linear_to_triton_scale(
scale_linear: torch.Tensor,
M: int,
K: int,
VEC_SIZE: int = 16,
) -> torch.Tensor:
"""
Convert linear scale format to Triton's 5D TMA layout.
Args:
scale_linear: [M, K // VEC_SIZE] FP8 E4M3 scales
M: Number of rows
K: Number of columns
VEC_SIZE: Number of elements per scale block
Returns:
scale_triton: [1, M//128, K//64, 2, 256] for TMA
"""
assert scale_linear.shape == (M, K // VEC_SIZE), \
f"Expected shape {(M, K // VEC_SIZE)}, got {scale_linear.shape}"
num_m_chunks = M // 128
num_k_chunks = (K // VEC_SIZE) // 4
# Reshape and permute for Triton's packed layout
scale = scale_linear.reshape(num_m_chunks, 4, 32, num_k_chunks, 4)
scale = scale.permute(0, 3, 2, 1, 4) # [M//128, K//64, 32, 4, 4]
scale = scale.reshape(num_m_chunks, num_k_chunks, 32, 16)
scale = scale.reshape(1, num_m_chunks, num_k_chunks, 2, 256)
return scale.contiguous()
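
# Shape sanity sketch (illustrative, not used by the inference path): for
# M = 256, K = 1024, VEC_SIZE = 16 the linear scale tensor is [256, 64] and the
# layout produced above is [1, 2, 16, 2, 256] (M//128 = 2, K//VEC_SIZE//4 = 16).
def _check_triton_scale_layout() -> None:
    """Minimal shape check for linear_to_triton_scale."""
    s = torch.zeros(256, 64).to(torch.float8_e4m3fn)
    assert linear_to_triton_scale(s, 256, 1024).shape == (1, 2, 16, 2, 256)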
def dequantize_nvfp4(
packed: torch.Tensor,
scale: torch.Tensor,
scale_2: torch.Tensor,
dtype: torch.dtype = torch.bfloat16,
) -> torch.Tensor:
"""
Dequantize NVFP4 tensor to float for reference/fallback.
Args:
packed: [M, K/2] uint8 packed tensor
scale: [M, K/16] FP8 E4M3 per-block scales
scale_2: [1] FP32 global scale
dtype: Output dtype
Returns:
tensor: [M, K] dequantized tensor
"""
M, K_half = packed.shape
K = K_half * 2
block_size = NVFP4_BLOCK_SIZE
# Unpack two FP4 values per byte
low = packed & 0x0F
high = (packed >> 4) & 0x0F
fp4_tensor = torch.stack([low, high], dim=-1).reshape(M, K)
# Lookup table dequantization (use cached LUT for efficiency)
lut = _get_nvfp4_lut(str(packed.device))
tensor = lut[fp4_tensor.long()]
# Apply dual-level scales
scale_f32 = scale.to(torch.float32)
tensor = tensor.reshape(M, K // block_size, block_size)
tensor = tensor * scale_f32.unsqueeze(-1) * scale_2
tensor = tensor.reshape(M, K)
return tensor.to(dtype)
def nvfp4_gemm_dequant(
x: torch.Tensor,
weight: torch.Tensor,
weight_scale: torch.Tensor,
weight_scale_2: torch.Tensor,
) -> torch.Tensor:
"""
NVFP4 GEMM via dequantization fallback.
This is a simple but slow implementation that dequantizes weights
to bfloat16 and uses standard matmul. Use for testing/validation.
Args:
x: Input activation [M, K] in bfloat16
weight: NVFP4 weight [N, K/2] packed uint8
weight_scale: Per-block scales [N, K/16] FP8 E4M3
weight_scale_2: Global scale [1] FP32
Returns:
y: Output [M, N] in bfloat16
"""
N, K_half = weight.shape
K = K_half * 2
# Dequantize weight to bfloat16
weight_bf16 = dequantize_nvfp4(weight, weight_scale, weight_scale_2, dtype=torch.bfloat16)
# Standard matmul
return torch.matmul(x, weight_bf16.T)
@triton.jit
def nvfp4_gemm_kernel(
a_desc, # Activation TMA descriptor [M, K/2]
a_scale_desc, # Activation scale TMA descriptor
b_desc, # Weight TMA descriptor [N, K/2]
b_scale_desc, # Weight scale TMA descriptor
c_desc, # Output TMA descriptor [M, N]
M: tl.constexpr,
N: tl.constexpr,
K: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
BLOCK_K: tl.constexpr,
VEC_SIZE: tl.constexpr,
rep_m: tl.constexpr,
rep_n: tl.constexpr,
rep_k: tl.constexpr,
NUM_STAGES: tl.constexpr,
):
"""Triton NVFP4 block-scaled GEMM kernel."""
pid = tl.program_id(axis=0)
num_pid_m = tl.cdiv(M, BLOCK_M)
pid_m = pid % num_pid_m
pid_n = pid // num_pid_m
offs_am = pid_m * BLOCK_M
offs_bn = pid_n * BLOCK_N
offs_k_a = 0
offs_k_b = 0
offs_scale_m = pid_m * rep_m
offs_scale_n = pid_n * rep_n
offs_scale_k = 0
c0 = tl.zeros((1,), dtype=tl.int32)[0]
accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
for k in tl.range(0, tl.cdiv(K, BLOCK_K), num_stages=NUM_STAGES):
a = a_desc.load([offs_am, offs_k_a])
b = b_desc.load([offs_bn, offs_k_b])
scale_a = a_scale_desc.load([c0, offs_scale_m, offs_scale_k, c0, c0])
scale_b = b_scale_desc.load([c0, offs_scale_n, offs_scale_k, c0, c0])
scale_a = scale_a.reshape(rep_m, rep_k, 32, 4, 4).trans(0, 3, 2, 1, 4).reshape(BLOCK_M, BLOCK_K // VEC_SIZE)
scale_b = scale_b.reshape(rep_n, rep_k, 32, 4, 4).trans(0, 3, 2, 1, 4).reshape(BLOCK_N, BLOCK_K // VEC_SIZE)
accumulator = tl.dot_scaled(a, scale_a, "e2m1", b.T, scale_b, "e2m1", accumulator)
offs_k_a += BLOCK_K // 2
offs_k_b += BLOCK_K // 2
offs_scale_k += rep_k
c_desc.store([offs_am, offs_bn], accumulator.to(tl.float16))
def nvfp4_gemm(
a: torch.Tensor,
a_scale: torch.Tensor,
b: torch.Tensor,
b_scale: torch.Tensor,
b_scale_2: Optional[torch.Tensor] = None,
a_scale_2: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""
Perform NVFP4 GEMM using Triton kernel: y = a @ b.T
For weight-only quantization (common case):
- a: bfloat16 activation [M, K]
- b: NVFP4 weight [N, K/2] packed
- b_scale: [N, K/16] FP8 E4M3
- b_scale_2: [1] FP32 global scale
Args:
a: Activation tensor [M, K] (bfloat16 or NVFP4 packed)
a_scale: Activation scale [M, K/16] (or None for bfloat16 input)
b: Weight tensor [N, K/2] packed uint8
b_scale: Weight per-block scale [N, K/16] FP8 E4M3
b_scale_2: Weight global scale [1] FP32
a_scale_2: Activation global scale [1] FP32 (optional)
Returns:
y: Output [M, N]
"""
# Get dimensions
if a.dtype == torch.uint8:
M, K_half = a.shape
K = K_half * 2
else:
M, K = a.shape
N = b.shape[0]
# Get configs
configs = get_nvfp4_configs()
BLOCK_M = configs["BLOCK_SIZE_M"]
BLOCK_N = configs["BLOCK_SIZE_N"]
BLOCK_K = configs["BLOCK_SIZE_K"]
VEC_SIZE = configs["VEC_SIZE"]
num_stages = configs["num_stages"]
# Check dimension alignment
    if M % BLOCK_M != 0 or N % BLOCK_N != 0 or K % BLOCK_K != 0:
        # Fall back to dequantization for unaligned dimensions; restore a bfloat16
        # activation first if it arrived pre-quantized as uint8.
        if a.dtype == torch.uint8:
            a = dequantize_nvfp4(a, a_scale, a_scale_2 if a_scale_2 is not None else torch.ones(1, device=a.device), dtype=torch.bfloat16)
        return nvfp4_gemm_dequant(a, b, b_scale, b_scale_2 if b_scale_2 is not None else torch.ones(1, device=a.device))
# If activation is bfloat16, quantize it to NVFP4 first
if a.dtype != torch.uint8:
a_nvfp4, a_scale, a_scale_2 = quantize_act_nvfp4(a)
else:
a_nvfp4 = a
# Convert scales to Triton format
a_scale_triton = linear_to_triton_scale(a_scale, M, K, VEC_SIZE)
b_scale_triton = linear_to_triton_scale(b_scale, N, K, VEC_SIZE)
# Create TMA descriptors
a_desc = TensorDescriptor.from_tensor(a_nvfp4, [BLOCK_M, BLOCK_K // 2])
b_desc = TensorDescriptor.from_tensor(b, [BLOCK_N, BLOCK_K // 2])
rep_m = BLOCK_M // 128
rep_n = BLOCK_N // 128
rep_k = BLOCK_K // VEC_SIZE // 4
a_scale_block_shape = [1, rep_m, rep_k, 2, 256]
b_scale_block_shape = [1, rep_n, rep_k, 2, 256]
a_scale_desc = TensorDescriptor.from_tensor(a_scale_triton, block_shape=a_scale_block_shape)
b_scale_desc = TensorDescriptor.from_tensor(b_scale_triton, block_shape=b_scale_block_shape)
# Allocate output
output = torch.empty((M, N), dtype=torch.float16, device=a.device)
c_desc = TensorDescriptor.from_tensor(output, [BLOCK_M, BLOCK_N])
# Launch kernel
grid = (triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N), 1)
nvfp4_gemm_kernel[grid](
a_desc, a_scale_desc,
b_desc, b_scale_desc,
c_desc,
M, N, K,
BLOCK_M, BLOCK_N, BLOCK_K,
VEC_SIZE,
rep_m, rep_n, rep_k,
num_stages,
)
# Apply global scales
if a_scale_2 is not None and b_scale_2 is not None:
output = output * (a_scale_2 * b_scale_2)
elif b_scale_2 is not None:
output = output * b_scale_2
return output.to(torch.bfloat16)
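
# Typical weight-only usage (illustrative sketch; variable names are examples):
#   w_packed, w_scale, w_scale_2 = quantize_act_nvfp4(weight_bf16)   # offline: [N, K/2], [N, K/16], [1]
#   y = nvfp4_gemm(x_bf16, None, w_packed, w_scale, w_scale_2)       # [M, N] bfloat16
# With block-aligned M, N, K this takes the Triton path and quantizes the
# activation on the fly; otherwise it falls back to nvfp4_gemm_dequant.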
def quantize_act_nvfp4(
x: torch.Tensor,
block_size: int = NVFP4_BLOCK_SIZE,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Quantize activation to NVFP4 format.
Args:
x: Input tensor [M, K] in float/bfloat16
block_size: Number of elements per scale block
Returns:
packed: [M, K/2] uint8 packed tensor
scale: [M, K/block_size] FP8 E4M3 per-block scales
scale_2: [1] FP32 global scale
"""
M, K = x.shape
device = x.device
x = x.to(torch.float32)
    # Compute global scale: amax maps to the largest representable product of an
    # E2M1 magnitude (6.0) and an E4M3 block scale (448.0)
    amax = x.abs().max()
    scale_2 = amax / (6.0 * 448.0)
    scale_2 = scale_2.clamp(min=1e-12)
# Compute per-block scales
x_blocks = x.reshape(M, K // block_size, block_size)
block_amax = x_blocks.abs().amax(dim=-1)
scale = (block_amax / (6.0 * scale_2)).clamp(min=1e-12, max=448.0)
scale = scale.to(torch.float8_e4m3fn)
# Quantize
scale_f32 = scale.to(torch.float32)
scale_expanded = (scale_f32 * scale_2).unsqueeze(-1)
scaled_x = x_blocks / scale_expanded
# Map to nearest NVFP4 values
nvfp4_values = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], device=device)
abs_x = scaled_x.abs()
signs = scaled_x.sign()
diffs = (abs_x.unsqueeze(-1) - nvfp4_values).abs()
indices = diffs.argmin(dim=-1)
fp4_values = indices.to(torch.uint8)
fp4_values = torch.where(signs < 0, fp4_values + 8, fp4_values)
fp4_tensor = fp4_values.reshape(M, K)
# Pack two values per byte
packed = (fp4_tensor[:, 0::2] & 0x0F) | ((fp4_tensor[:, 1::2] & 0x0F) << 4)
packed = packed.to(torch.uint8)
return packed, scale, scale_2.reshape(1)
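
# Round-trip sanity sketch (illustrative, not part of the inference path):
# quantize a random tensor and reconstruct it with dequantize_nvfp4. The
# default sizes below are arbitrary example values.
def _nvfp4_roundtrip_error(M: int = 128, K: int = 256, device: str = "cpu") -> float:
    """Return the mean absolute error of a quantize -> dequantize round trip."""
    x = torch.randn(M, K, device=device, dtype=torch.float32)
    packed, scale, scale_2 = quantize_act_nvfp4(x)
    x_rec = dequantize_nvfp4(packed, scale, scale_2, dtype=torch.float32)
    return (x - x_rec).abs().mean().item()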
def act_quant_nvfp4(
x: torch.Tensor,
block_size: int = NVFP4_BLOCK_SIZE,
scale_fmt: Optional[str] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Quantize activation with interface matching original act_quant.
Args:
x: Input tensor
block_size: Block size for quantization
scale_fmt: Scale format (unused, for API compatibility)
Returns:
y: Quantized tensor [M, K/2] packed uint8
s: Scale tensor [M, K/block_size] FP8 E4M3
"""
    packed, scale, scale_2 = quantize_act_nvfp4(x.view(-1, x.size(-1)), block_size)
    y = packed.view(*x.shape[:-1], x.size(-1) // 2)
    s = scale.view(*x.shape[:-1], -1)
    # Attach the global scale to the tensor that is actually returned; view()
    # creates a new object, so setting the attribute on `scale` would lose it.
    s.scale_2 = scale_2
    return y, s
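
# Example (illustrative): for x of shape [B, T, K], act_quant_nvfp4 returns a
# packed tensor of shape [B, T, K/2] and a scale tensor of shape [B, T, K/16];
# the FP32 global scale rides along as the `scale_2` attribute of that tensor.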
# Test function
def test_nvfp4_gemm():
"""Test NVFP4 GEMM implementation."""
M, N, K = 256, 512, 1024
# Create random tensors
x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
# Create fake NVFP4 weight
weight_bf16 = torch.randn(N, K, device="cuda", dtype=torch.bfloat16)
# Quantize weight to NVFP4
weight_packed, weight_scale, weight_scale_2 = quantize_act_nvfp4(weight_bf16)
# Reference: dequantize and matmul
weight_deq = dequantize_nvfp4(weight_packed, weight_scale, weight_scale_2, dtype=torch.bfloat16)
ref = torch.matmul(x, weight_deq.T)
# Test dequant path
out_deq = nvfp4_gemm_dequant(x, weight_packed, weight_scale, weight_scale_2)
    # Compare against the reference (the dequant path computes the same product)
    error = (ref - out_deq).abs().mean()
    assert error < 1e-3, f"NVFP4 GEMM dequant test failed: mean abs error = {error:.6f}"
    print(f"PASS: NVFP4 GEMM dequant test: mean abs error = {error:.6f}")
    return True
if __name__ == "__main__":
test_nvfp4_gemm()