Comment # 8 on bug 1204267 from Aaron Puchert

Since I'm not sure what the precise target machine is, I've used flags similar
to how we build LLVM itself (see the specfile):

    llc -march=arm --float-abi=hard -mattr=+armv7-a,+vfp3d16

This reproduces the crash, just with a slightly different message:

LLVM ERROR: Cannot select: t933: v4i32 = ARMISD::VCMPZ t1307, Constant:i32<2>
  t1307: v4i32,ch = ARMISD::VLD1DUP<(load (s32) from %ir.584)> t0, t1429:1,
Constant:i32<4>
    t1429: i32,i32,ch = load<(load (s32) from
%ir."&context.constants_ptr[]5618", align 8), <post-inc>> t0, t2,
Constant:i32<64>
      t2: i32,ch = CopyFromReg t0, Register:i32 %45
        t1: i32 = Register %45
      t212: i32 = Constant<64>
    t49: i32 = Constant<4>
  t28: i32 = Constant<2>

What's different is the added IR names, but they're not immediately helpful:
the source contains a "&context.constants_ptr[]56", so the trailing "18" in
"&context.constants_ptr[]5618" was presumably appended for disambiguation.

The crash is reproducible on the current main branch, so it's still not fixed.
With

    bugpoint --run-llc <input-file> --tool-args <options as above>

we can reduce it to this:

define void @fs_variant_partial() {
entry:
  %output = alloca <4 x float>, align 16
  br label %loop_begin

loop_begin:                                       ; preds = %skip, %entry
  br i1 undef, label %skip, label %0

0:                                                ; preds = %loop_begin
  %1 = icmp uge <4 x i32> zeroinitializer, undef
  %2 = sext <4 x i1> %1 to <4 x i32>
  %3 = load i32, i32* undef, align 4
  %4 = insertelement <4 x i32> undef, i32 %3, i32 3
  %5 = trunc <4 x i32> %2 to <4 x i1>
  %6 = select <4 x i1> %5, <4 x i32> zeroinitializer, <4 x i32> %4
  %7 = insertvalue [4 x <4 x i32>] undef, <4 x i32> %6, 0
  %8 = insertvalue [4 x <4 x i32>] %7, <4 x i32> undef, 1
  %9 = insertvalue [4 x <4 x i32>] %8, <4 x i32> undef, 2
  %10 = insertvalue [4 x <4 x i32>] %9, <4 x i32> undef, 3
  %11 = extractvalue [4 x <4 x i32>] %10, 0
  %12 = bitcast <4 x i32> %11 to <4 x float>
  %13 = fmul <4 x float> zeroinitializer, %12
  %14 = fadd <4 x float> %13, zeroinitializer
  %15 = fadd <4 x float> %14, zeroinitializer
  %16 = bitcast <4 x float> %15 to <4 x i32>
  %17 = insertvalue [4 x <4 x i32>] undef, <4 x i32> %16, 0
  %18 = insertvalue [4 x <4 x i32>] %17, <4 x i32> undef, 1
  %19 = insertvalue [4 x <4 x i32>] %18, <4 x i32> undef, 2
  %20 = insertvalue [4 x <4 x i32>] %19, <4 x i32> undef, 3
  %21 = extractvalue [4 x <4 x i32>] %20, 0
  %22 = bitcast <4 x i32> %21 to <4 x float>
  store <4 x float> %22, <4 x float>* %output, align 16
  br label %skip

skip:                                             ; preds = %0, %loop_begin
  br label %loop_begin
}

Crash is slightly different now:

LLVM ERROR: Cannot select: t48: v4i32 = ARMISD::VCMPZ undef:v4i32,
Constant:i32<2>
  t3: v4i32 = undef
  t47: i32 = Constant<2>

This obviously corresponds to the instruction

  %1 = icmp uge <4 x i32> zeroinitializer, undef

With that knowledge we can reduce further:

define <4 x i32> @fs_variant_partial() {
  %1 = icmp uge <4 x i32> zeroinitializer, undef
  %2 = sext <4 x i1> %1 to <4 x i32>
  ret <4 x i32> %2
}

or

define <4 x i32> @fs_variant_partial(<4 x i32> %0) {
  %2 = icmp uge <4 x i32> zeroinitializer, %0
  %3 = sext <4 x i1> %2 to <4 x i32>
  ret <4 x i32> %3
}

I'll see if I can spot where we're missing something, but likely I'll just file
a bug and let the ARM people figure out where this should be fixed. From the
looks of it we're simply not able to lower "icmp uge <4 x i32> zeroinitializer,
...", and the nested instructions have nothing to do with it.