Skip to content

[AMDGPU] incorrect operand folding for llvm.stepvector + packed ops #139317

Open
@raiseirql

Description

@raiseirql

The LLVM IR below produces incorrect AMDGPU code; see the instruction sequence that follows. The kernel loads a float16x8 vector, adds the constants [0..7] to it lane by lane, and stores the result back out. In the generated code, the third v_pk_add_f16 has lost its constant operand: it should add the packed pair [2.0, 3.0], but it adds 0 instead. The error is introduced in the si-fold-operands pass. I was able to reproduce it at commit 7babf22461deb846827859de2e472a062815095b.

Let me know if I can provide any more details.

        v_pk_add_f16 v7, v7, s8
        v_pk_add_f16 v6, v6, s9
        v_pk_add_f16 v5, v5, 0 ; <<<<=== not correct: the packed constant [2.0, 3.0] is missing
        v_pk_add_f16 v4, v4, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
; ModuleID = 'stepper.mojo'
source_filename = "stepper.mojo"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"

; Reproducer kernel: for each group of 8 half-precision elements, load them
; from %0, add the per-lane constants derived from llvm.stepvector, and store
; the sums to %1. %2 is the element count that bounds the loop.
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
define dso_local amdgpu_kernel void @stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b97a102776ef47f0e8e894a38(ptr noundef readonly captures(none) %0, ptr noundef writeonly captures(none) %1, i32 noundef %2) #0 {
  ; Cast both generic pointers to addrspace(1) before they are used for the
  ; load and the store.
  %.global = addrspacecast ptr %1 to ptr addrspace(1)
  %.global1 = addrspacecast ptr %0 to ptr addrspace(1)
  ; Loop-invariant addend: the step vector converted to half. Per the issue
  ; description this is the constant [0..7].
  %4 = tail call <8 x i32> @llvm.stepvector.v8i32()
  %5 = sitofp <8 x i32> %4 to <8 x half>
  ; Widen the count to 64 bits for the loop compare; skip the loop entirely
  ; when the count is zero.
  %6 = zext i32 %2 to i64
  %.not = icmp eq i32 %2, 0
  br i1 %.not, label %._crit_edge, label %.lr.ph

.lr.ph:                                           ; preds = %3, %.lr.ph
  ; %7 is the current element index; it starts at 0 and advances by 8 per
  ; iteration (%8 is the next index).
  %7 = phi i64 [ %8, %.lr.ph ], [ 0, %3 ]
  %8 = add nuw nsw i64 %7, 8
  ; Load 8 halves from source element %7.
  %9 = getelementptr inbounds nuw half, ptr addrspace(1) %.global1, i64 %7
  %10 = load <8 x half>, ptr addrspace(1) %9, align 2
  ; The fadd whose constant operand is miscompiled according to the report.
  %11 = fadd contract <8 x half> %10, %5
  ; Store the 8 sums to destination element %7.
  %12 = getelementptr inbounds nuw half, ptr addrspace(1) %.global, i64 %7
  store <8 x half> %11, ptr addrspace(1) %12, align 2
  ; Keep looping while the next index is still below the element count.
  %13 = icmp samesign ult i64 %8, %6
  br i1 %13, label %.lr.ph, label %._crit_edge

._crit_edge:                                      ; preds = %.lr.ph, %3
  ret void
}

; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
declare <8 x i32> @llvm.stepvector.v8i32() #1

attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx942" "target-features" "uniform-work-group-size"="false" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }

!llvm.module.flags = !{!0}

!0 = !{i32 2, !"Debug Info Version", i32 3}
        ; Compiler output for the reproducer kernel (target gfx942). Kept
        ; verbatim as evidence of the miscompile; the faulty instruction is
        ; marked BUG below.
        ;
        ; Register roles as used in this listing:
        ;   s[0:1]  on entry: base for the argument loads (presumably the
        ;           kernarg segment pointer - confirm against the AMDGPU ABI);
        ;           after the dwordx4 load: address used by the global load
        ;   s[2:3]  address used by the global store
        ;   s[4:5]  64-bit element index, advanced by 8 per iteration
        ;   s[6:7]  element count (s6) with s7 = 0 as the upper 32 bits
        ;   s8, s9  packed-f16 constants for the last two adds
        ;   v[0:1]  copy of s[6:7] for the 64-bit compare
        ;   v2      zero offset for the global load/store
        ;   v[4:7]  the 8 halves being processed
        .amdgcn_target "amdgcn-amd-amdhsa--gfx942"
        .amdhsa_code_object_version 6
        .text
        .globl  stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b97a102776ef47f0e8e894a38
        .p2align        8
        .type   stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b97a102776ef47f0e8e894a38,@function
stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b97a102776ef47f0e8e894a38:
stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b97a102776ef47f0e8e894a38$local:
        .type   stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b97a102776ef47f0e8e894a38$local,@function
        ; Load the 32-bit argument at offset 0x10 (the element count) and
        ; exit immediately when it is zero.
        s_load_dword s6, s[0:1], 0x10
        s_mov_b32 s7, 0
        s_waitcnt lgkmcnt(0)
        s_cmp_eq_u32 s6, 0
        s_cbranch_scc1 .LBB0_3
        ; Load four dwords from offset 0 into s[0:3]; s[0:1] and s[2:3] are
        ; then used as the load and store addresses. This overwrites the
        ; incoming s[0:1].
        s_load_dwordx4 s[0:3], s[0:1], 0x0
        ; As IEEE half pairs (low, high): 0x47004600 = [6.0, 7.0] and
        ; 0x45004400 = [4.0, 5.0].
        s_mov_b32 s8, 0x47004600
        s_mov_b32 s9, 0x45004400
        s_mov_b64 s[4:5], 0
        v_mov_b32_e32 v2, 0
        v_mov_b64_e32 v[0:1], s[6:7]
.LBB0_2:
        ; Loop body: load 16 bytes (8 halves), then advance the index by 8
        ; elements and the load address by 16 bytes.
        s_waitcnt lgkmcnt(0)
        global_load_dwordx4 v[4:7], v2, s[0:1]
        s_add_u32 s4, s4, 8
        s_addc_u32 s5, s5, 0
        s_add_u32 s0, s0, 16
        s_addc_u32 s1, s1, 0
        ; vcc = (next index < element count), unsigned 64-bit.
        v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
        s_waitcnt vmcnt(0)
        ; Packed adds, two halves per register. The intended addends are
        ; v7 += [6.0, 7.0], v6 += [4.0, 5.0], v5 += [2.0, 3.0], v4 += [0.0, 1.0].
        v_pk_add_f16 v7, v7, s8
        v_pk_add_f16 v6, v6, s9
        ; BUG (the subject of this issue): the addend should be the packed
        ; pair [2.0, 3.0] (0x42004000 as half bits), but the operand is 0.
        v_pk_add_f16 v5, v5, 0
        ; NOTE(review): the op_sel/op_sel_hi modifiers presumably make the
        ; inline 1.0 act as [0.0, 1.0]; the issue does not flag this line -
        ; confirm against the ISA manual.
        v_pk_add_f16 v4, v4, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
        ; Store the 8 results, advance the store address by 16 bytes, and
        ; loop while vcc is set.
        global_store_dwordx4 v2, v[4:7], s[2:3]
        s_add_u32 s2, s2, 16
        s_addc_u32 s3, s3, 0
        s_cbranch_vccnz .LBB0_2
.LBB0_3:
        s_endpgm

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions