Open
Description
The LLVM IR below produces incorrect AMDGPU code. In particular, see the instruction sequence that follows. The code loads a float16x8 (<8 x half>), adds [0..7] to it, and stores the result back out. The third v_pk_add_f16 has had its constant [2.0, 3.0] removed and adds 0 instead. The error is introduced inside si-fold-operands. I was able to reproduce with 7babf22461deb846827859de2e472a062815095b.
Let me know if I can provide any more details.
v_pk_add_f16 v7, v7, s8
v_pk_add_f16 v6, v6, s9
v_pk_add_f16 v5, v5, 0 ; <<<<=== not correct: the [2.0, 3.0] addend is missing
v_pk_add_f16 v4, v4, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
; ModuleID = 'stepper.mojo'
source_filename = "stepper.mojo"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
; Reproducer kernel: walks the buffers 8 halfs at a time, loading <8 x half>
; from %0, adding sitofp(stepvector) to it, and storing the sum to %1 at the
; same element index.
;   %0 - source pointer (only loaded from)
;   %1 - destination pointer (only stored to)
;   %2 - unsigned bound that the half-element index is compared against
define dso_local amdgpu_kernel void @stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b97a102776ef47f0e8e894a38(ptr noundef readonly captures(none) %0, ptr noundef writeonly captures(none) %1, i32 noundef %2) #0 {
; Both pointers are cast to addrspace(1) before use.
%.global = addrspacecast ptr %1 to ptr addrspace(1)
%.global1 = addrspacecast ptr %0 to ptr addrspace(1)
; Loop-invariant addend: the step vector converted to half; per the report
; this is [0..7], so every lane adds a different constant.
%4 = tail call <8 x i32> @llvm.stepvector.v8i32()
%5 = sitofp <8 x i32> %4 to <8 x half>
; Widen the bound to i64 so it can be compared with the i64 index.
%6 = zext i32 %2 to i64
; A zero bound skips the loop entirely.
%.not = icmp eq i32 %2, 0
br i1 %.not, label %._crit_edge, label %.lr.ph
.lr.ph: ; preds = %3, %.lr.ph
; %7 is the current element index (starts at 0), %8 the next one (%7 + 8).
%7 = phi i64 [ %8, %.lr.ph ], [ 0, %3 ]
%8 = add nuw nsw i64 %7, 8
; Load 8 halfs at index %7 from the source (alignment is only 2 bytes).
%9 = getelementptr inbounds nuw half, ptr addrspace(1) %.global1, i64 %7
%10 = load <8 x half>, ptr addrspace(1) %9, align 2
; The add whose lowering is reported as miscompiled.
%11 = fadd contract <8 x half> %10, %5
; Store the sum to the destination at the same index.
%12 = getelementptr inbounds nuw half, ptr addrspace(1) %.global, i64 %7
store <8 x half> %11, ptr addrspace(1) %12, align 2
; Bottom-tested loop: continue while the next index is below the bound.
%13 = icmp samesign ult i64 %8, %6
br i1 %13, label %.lr.ph, label %._crit_edge
._crit_edge: ; preds = %.lr.ph, %3
ret void
}
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
declare <8 x i32> @llvm.stepvector.v8i32() #1
attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx942" "target-features" "uniform-work-group-size"="false" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
!llvm.module.flags = !{!0}
!0 = !{i32 2, !"Debug Info Version", i32 3}
.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
.amdhsa_code_object_version 6
.text
; Generated gfx942 code for the reproducer kernel, kept exactly as emitted so
; the miscompile stays visible. Register roles as used below:
;   s[0:1] - load base address, advanced by 16 bytes per iteration
;   s[2:3] - store base address, advanced by 16 bytes per iteration
;   s[4:5] - 64-bit element index, advanced by 8 per iteration
;   s[6:7] - loop bound (s6 loaded from offset 0x10, s7 forced to 0)
;   s8, s9 - packed f16 constants 0x47004600 = [6.0, 7.0], 0x45004400 = [4.0, 5.0]
;   v[0:1] - VGPR copy of the bound for the 64-bit compare
;   v2     - zero offset used by the global load/store
;   v[4:7] - the 8 halfs being processed (two per VGPR)
.globl stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b97a102776ef47f0e8e894a38
.p2align 8
.type stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b97a102776ef47f0e8e894a38,@function
stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b97a102776ef47f0e8e894a38:
stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b97a102776ef47f0e8e894a38$local:
.type stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b97a102776ef47f0e8e894a38$local,@function
; Fetch the 32-bit bound from offset 0x10 of the block addressed by s[0:1]
; (presumably the kernarg segment, where it follows the two 8-byte pointers;
; confirm against the ABI).
s_load_dword s6, s[0:1], 0x10
; Zero the high dword so s[6:7] is the zero-extended 64-bit bound.
s_mov_b32 s7, 0
s_waitcnt lgkmcnt(0)
; Bound == 0: nothing to do, jump straight to the exit.
s_cmp_eq_u32 s6, 0
s_cbranch_scc1 .LBB0_3
; Fetch both pointers in one load: s[0:1] = load base, s[2:3] = store base.
s_load_dwordx4 s[0:3], s[0:1], 0x0
; Materialize the addends for elements 6..7 and 4..5 as packed f16 pairs.
s_mov_b32 s8, 0x47004600
s_mov_b32 s9, 0x45004400
; Element index starts at 0.
s_mov_b64 s[4:5], 0
v_mov_b32_e32 v2, 0
v_mov_b64_e32 v[0:1], s[6:7]
.LBB0_2:
s_waitcnt lgkmcnt(0)
; Load 8 halfs (16 bytes) from the current load address.
global_load_dwordx4 v[4:7], v2, s[0:1]
; Advance the index by 8 elements and the load address by 16 bytes while the
; load is in flight.
s_add_u32 s4, s4, 8
s_addc_u32 s5, s5, 0
s_add_u32 s0, s0, 16
s_addc_u32 s1, s1, 0
; vcc = (next index < bound), consumed by the branch at the loop bottom.
v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
s_waitcnt vmcnt(0)
; Elements 6..7 += [6.0, 7.0]
v_pk_add_f16 v7, v7, s8
; Elements 4..5 += [4.0, 5.0]
v_pk_add_f16 v6, v6, s9
; BUG (subject of this report): elements 2..3 should get [2.0, 3.0] added,
; i.e. a packed constant of 0x42004000, but the operand was folded to 0.
v_pk_add_f16 v5, v5, 0
; Elements 0..1: per the report this adds [0.0, 1.0]; the op_sel/op_sel_hi
; modifiers appear to route the inline 1.0 to the high lane only (TODO confirm
; against the ISA documentation).
v_pk_add_f16 v4, v4, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
; Store the 8 results and advance the store address by 16 bytes.
global_store_dwordx4 v2, v[4:7], s[2:3]
s_add_u32 s2, s2, 16
s_addc_u32 s3, s3, 0
; Loop again while the compare above set vcc.
s_cbranch_vccnz .LBB0_2
.LBB0_3:
s_endpgm