[InstCombine] Div ceil optimizations  (#190175)

Relates: https://github.com/llvm/llvm-project/issues/187838

This PR improves handling of `div_ceil` from rust (which emits a div +
rem).

Currently, these three rust functions:
```rust
use std::hint::assert_unchecked;

#[unsafe(no_mangle)]
pub fn div_ceil_without_assume(x: u32) -> u32 {
    x.div_ceil(7)
}

#[unsafe(no_mangle)]
pub fn div_ceil_with_assume(x: u32) -> u32 {
    unsafe {
        assert_unchecked(x <= u32::MAX - 7);
    }
    x.div_ceil(7)
}

#[unsafe(no_mangle)]
pub fn div_ceil_with_range(x: u32) -> u32 {
    x.count_zeros().div_ceil(7)
}
```

Will emit this IR (cleaned up): The IR looks pretty good to me as both
the assert_unchecked and the popcount are providing range information.

```llvm
define noundef range(i32 0, 613566758) i32 @div_ceil_without_assume(i32 noundef %x) unnamed_addr {
start:
  %d = udiv i32 %x, 7
  %r = urem i32 %x, 7
  %_4.not = icmp ne i32 %r, 0
  %0 = zext i1 %_4.not to i32
  %_0.sroa.0.0 = add nuw nsw i32 %d, %0
  ret i32 %_0.sroa.0.0
}

define noundef range(i32 0, 613566757) i32 @div_ceil_with_assume(i32 noundef %x) unnamed_addr {
start:
  %cond = icmp ult i32 %x, -7
  tail call void @llvm.assume(i1 %cond)
  %d = udiv i32 %x, 7
  %r = urem i32 %x, 7
  %_5.not = icmp ne i32 %r, 0
  %0 = zext i1 %_5.not to i32
  %_0.sroa.0.0 = add nuw nsw i32 %d, %0
  ret i32 %_0.sroa.0.0
}

define noundef range(i32 0, 6) i32 @div_ceil_with_range(i32 noundef %x) unnamed_addr {
start:
  %self = xor i32 %x, -1
  %0 = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 %self)
  %d.lhs.trunc = trunc nuw nsw i32 %0 to i8
  %d3 = udiv i8 %d.lhs.trunc, 7
  %d.zext = zext nneg i8 %d3 to i32
  %r4 = urem i8 %d.lhs.trunc, 7
  %_6.not = icmp ne i8 %r4, 0
  %1 = zext i1 %_6.not to i32
  %_0.sroa.0.0 = add nuw nsw i32 %1, %d.zext
  ret i32 %_0.sroa.0.0
}

declare void @llvm.assume(i1 noundef)

declare i32 @llvm.ctpop.i32(i32)
```
After running through opt and llc on main with popcnt for less noise:
`build/bin/opt -O2 -S < test.ll | build/bin/llc --x86-asm-syntax=intel
-mattr=+popcnt -O2 -o -`

```asm
div_ceil_without_assume:                # @div_ceil_without_assume                                                  
        mov     eax, edi                                                                                            
        movabs  rcx, 2635249153617166336                                                                            
        mul     rcx                                                                                                 
        lea     eax, [8*rdx]                                                                                        
        mov     ecx, edx                                                                                            
        sub     ecx, eax                                                                                            
        xor     eax, eax                                                                                            
        add     ecx, edi                                                                                            
        setne   al                                                                                                  
        add     eax, edx                                                                                            
        ret                                                                                                         
div_ceil_with_assume:                   # @div_ceil_with_assume                                                     
        mov     eax, edi                                                                                            
        movabs  rcx, 2635249153617166336                                                                            
        mul     rcx                                                                                                 
        lea     eax, [8*rdx]                                                                                        
        mov     ecx, edx                                                                                            
        sub     ecx, eax                                                                                            
        xor     eax, eax                                                                                            
        add     ecx, edi                                                                                            
        setne   al                                                                                                  
        add     eax, edx                                                                                            
        ret                                                                                                         
div_ceil_with_range:                    # @div_ceil_with_range                                                      
        not     edi                                                                                                 
        popcnt  ecx, edi                                                                                            
        lea     eax, [rcx + 8*rcx]                                                                                  
        lea     edx, [rcx + 4*rax]                                                                                  
        shr     edx, 8                                                                                              
        lea     esi, [8*rdx]                                                                                        
        sub     esi, edx                                                                                            
        xor     eax, eax                                                                                            
        cmp     cl, sil                                                                                             
        setne   al                                                                                                  
        add     eax, edx                                                                                            
        ret                                                                                                         
```

(as a sidenote, just running llc seems to generate better code? I find
this a bit odd)

`build/bin/llc test.ll --x86-asm-syntax=intel -mattr=+popcnt -O2 -o -`

```asm
div_ceil_without_assume:                # @div_ceil_without_assume                                                  
        mov     eax, edi                                                                                            
        movabs  rcx, 2635249153617166336                                                                            
        mul     rcx                                                                                                 
        mov     rax, rdx                                                                                            
        imul    ecx, edi, -1227133513                                                                               
        cmp     ecx, 613566757                                                                                      
        sbb     eax, -1                                                                                             
        ret                                                                                                         
div_ceil_with_assume:                   # @div_ceil_with_assume                                                     
        mov     eax, edi                                                                                            
        movabs  rcx, 2635249153617166336                                                                            
        mul     rcx                                                                                                 
        mov     rax, rdx                                                                                            
        imul    ecx, edi, -1227133513                                                                               
        cmp     ecx, 613566757                                                                                      
        sbb     eax, -1                                                                                             
        ret                                                                                                         
div_ceil_with_range:                    # @div_ceil_with_range                                                      
        not     edi                                                                                                 
        popcnt  ecx, edi                                                                                            
        lea     eax, [rcx + 8*rcx]                                                                                  
        lea     eax, [rcx + 4*rax]                                                                                  
        shr     eax, 8                                                                                              
        imul    ecx, ecx, -73                                                                                       
        cmp     cl, 37 
```

Anyway the llvm IR when run through opt at -O2:

On main:

The only changes I see are the urem being rewritten to a mul + sub for
all the instructions, and the range information is persisted but not
used.

```llvm
define noundef range(i32 0, 613566758) i32 @div_ceil_without_assume(i32 noundef %x) unnamed_addr {
start:
  %d = udiv i32 %x, 7
  %0 = mul i32 %d, 7
  %r.decomposed = sub i32 %x, %0
  %_4.not = icmp ne i32 %r.decomposed, 0
  %1 = zext i1 %_4.not to i32
  %_0.sroa.0.0 = add nuw nsw i32 %d, %1
  ret i32 %_0.sroa.0.0
}

define noundef range(i32 0, 613566757) i32 @div_ceil_with_assume(i32 noundef %x) unnamed_addr {
start:
  %cond = icmp ult i32 %x, -7
  tail call void @llvm.assume(i1 %cond)
  %d = udiv i32 %x, 7
  %0 = mul i32 %d, 7
  %r.decomposed = sub i32 %x, %0
  %_5.not = icmp ne i32 %r.decomposed, 0
  %1 = zext i1 %_5.not to i32
  %_0.sroa.0.0 = add nuw nsw i32 %d, %1
  ret i32 %_0.sroa.0.0
}

define noundef range(i32 0, 6) i32 @div_ceil_with_range(i32 noundef %x) unnamed_addr {
start:
  %self = xor i32 %x, -1
  %0 = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 %self)
  %d.lhs.trunc = trunc nuw nsw i32 %0 to i8
  %d3 = udiv i8 %d.lhs.trunc, 7
  %d.zext = zext nneg i8 %d3 to i32
  %1 = mul i8 %d3, 7
  %r4.decomposed = sub i8 %d.lhs.trunc, %1
  %_6.not = icmp ne i8 %r4.decomposed, 0
  %2 = zext i1 %_6.not to i32
  %_0.sroa.0.0 = add nuw nsw i32 %2, %d.zext
  ret i32 %_0.sroa.0.0
}

declare void @llvm.assume(i1 noundef)

declare i32 @llvm.ctpop.i32(i32)
```

However, both the assume and the popcount provide range information so
we can do an optimization. Div_ceil is emitted as a udiv and urem. We
can combine them given the following rules:

Assuming you have floor division of X / Y, we can add Y - 1 to X and
floor division will always give us (floor_divide(X / Y) + 1) which gives
us the same result.

So the formula in general is:

```
div_ceil(X, Y) = X / Y + (1 if X % Y > 0 else 0)  -> X + Y - 1 / Y
```

But this fails since I forgot about wrapping. So we need a condition
that X + Y - 1 does not wrap, and of course Y cannot be 0. Technically
we should ignore Y / 1 since that's a trivial identity.

That gets us:

```
add(udiv(X, Y), zext(icmp ne(urem(X, Y), 0))) -> udiv(add nuw(X, Y - 1), Y)
```

[Alive proof here](https://alive2.llvm.org/ce/z/nBvyv4)

This is where I thought I was done, because this should handle the range
and assume case (the assume case provides less information than the
popcount because popcount's range is narrower for X). So I thought I was
done.

Unfortunately this only optimizes one case (the assume case with the
wide code). The culprit here is in the popcount case, there's a trunc
for the popcount to bound it to an i8 since narrower arithmetic allows
for better optimizations before zero extending.

```llvm
  %d.lhs.trunc = trunc nuw nsw i32 %0 to i8
  %d3 = udiv i8 %d.lhs.trunc, 7
  %d.zext = zext nneg i8 %d3 to i32
  %r4 = urem i8 %d.lhs.trunc, 7
  %_6.not = icmp ne i8 %r4, 0
  %1 = zext i1 %_6.not to i32
```

So we need to handle another form, when the udiv needs to be zexted to
the return type (because it was previously trunced, due to the
popcount).

```
add(zext(udiv(X, Y)), zext(icmp ne(urem(X, Y), 0))) -> zext(udiv(add nuw(X, Y - 1), Y))
```

[Alive2 Proof](https://alive2.llvm.org/ce/z/w7bRZW)

There was another oddity I found that trunced information wasn't passing
constant ranges, so I added that to ValueTracking.cpp to fix this fold
too.

On this branch the IR now transforms:

```llvm
define noundef range(i32 0, 613566758) i32 @div_ceil_without_assume(i32 noundef %x) unnamed_addr {
start:
  %d = udiv i32 %x, 7
  %0 = mul i32 %d, 7
  %r.decomposed = sub i32 %x, %0
  %_4.not = icmp ne i32 %r.decomposed, 0
  %1 = zext i1 %_4.not to i32
  %_0.sroa.0.0 = add nuw nsw i32 %d, %1
  ret i32 %_0.sroa.0.0
}

define noundef range(i32 0, 613566757) i32 @div_ceil_with_assume(i32 noundef %x) unnamed_addr {
start:
  %cond = icmp ult i32 %x, -7
  tail call void @llvm.assume(i1 %cond)
  %0 = add nuw i32 %x, 6
  %_0.sroa.0.0 = udiv i32 %0, 7
  ret i32 %_0.sroa.0.0
}

define noundef range(i32 0, 6) i32 @div_ceil_with_range(i32 noundef %x) unnamed_addr {
start:
  %self = xor i32 %x, -1
  %0 = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 %self)
  %d.lhs.trunc = trunc nuw nsw i32 %0 to i8
  %1 = add nuw nsw i8 %d.lhs.trunc, 6
  %2 = udiv i8 %1, 7
  %_0.sroa.0.0 = zext nneg i8 %2 to i32
  ret i32 %_0.sroa.0.0
}

declare void @llvm.assume(i1 noundef)

declare i32 @llvm.ctpop.i32(i32)
```

And we get this asm emitted:

```asm
div_ceil_without_assume:                # @div_ceil_without_assume
        mov     eax, edi
        movabs  rcx, 2635249153617166336
        mul     rcx
        lea     eax, [8*rdx]
        mov     ecx, edx
        sub     ecx, eax
        xor     eax, eax
        add     ecx, edi
        setne   al
        add     eax, edx
        ret
div_ceil_with_assume:                   # @div_ceil_with_assume
        lea     eax, [rdi + 6]
        movabs  rcx, 2635249153617166336
        mul     rcx
        mov     rax, rdx
        ret
div_ceil_with_range:                    # @div_ceil_with_range
        not     edi
        popcnt  eax, edi
        add     al, 6
        movzx   eax, al
        imul    eax, eax, 147
        shr     eax, 10
        ret
```
5 files changed
tree: 602c81e95c18e86d7d85fd82094d3c898c4cb090
  1. .ci/
  2. .github/
  3. bolt/
  4. clang/
  5. clang-tools-extra/
  6. cmake/
  7. compiler-rt/
  8. cross-project-tests/
  9. flang/
  10. flang-rt/
  11. libc/
  12. libclc/
  13. libcxx/
  14. libcxxabi/
  15. libsycl/
  16. libunwind/
  17. lld/
  18. lldb/
  19. llvm/
  20. llvm-libgcc/
  21. mlir/
  22. offload/
  23. openmp/
  24. orc-rt/
  25. polly/
  26. runtimes/
  27. third-party/
  28. utils/
  29. .clang-format
  30. .clang-format-ignore
  31. .clang-tidy
  32. .git-blame-ignore-revs
  33. .gitattributes
  34. .gitignore
  35. .mailmap
  36. CODE_OF_CONDUCT.md
  37. CONTRIBUTING.md
  38. LICENSE.TXT
  39. pyproject.toml
  40. README.md
  41. SECURITY.md
README.md

The LLVM Compiler Infrastructure

OpenSSF Scorecard OpenSSF Best Practices libc++

Welcome to the LLVM project!

This repository contains the source code for LLVM, a toolkit for the construction of highly optimized compilers, optimizers, and run-time environments.

The LLVM project has multiple components. The core of the project is itself called “LLVM”. This contains all of the tools, libraries, and header files needed to process intermediate representations and convert them into object files. Tools include an assembler, disassembler, bitcode analyzer, and bitcode optimizer.

C-like languages use the Clang frontend. This component compiles C, C++, Objective-C, and Objective-C++ code into LLVM bitcode -- and from there into object files, using LLVM.

Other components include: the libc++ C++ standard library, the LLD linker, and more.

Getting the Source Code and Building LLVM

Consult the Getting Started with LLVM page for information on building and running LLVM.

For information on how to contribute to the LLVM project, please take a look at the Contributing to LLVM guide.

Getting in touch

Join the LLVM Discourse forums, Discord chat, LLVM Office Hours or Regular sync-ups.

The LLVM project has adopted a code of conduct for participants to all modes of communication within the project.