diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d0794cb9bfde3d..c3b919921f23b3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29851,17 +29851,103 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                        DAG.getNode(Opc, dl, ExtVT, R, Amt));
   }
 
-  // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
-  // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
+  // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors by
+  // using vXi16 vector operations.
   if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
       (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
        (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
       !Subtarget.hasXOP()) {
     int NumElts = VT.getVectorNumElements();
+    MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
+    // We can do this faster if each pair of i8 elements is shifted by the
+    // same amount, SWAR style: use a single vXi16 shift to move the valid
+    // bits into position, then mask out any bits which crossed from one i8
+    // element into the other.
+    if (Opc == ISD::SRL || Opc == ISD::SHL) {
+      APInt UndefElts;
+      SmallVector<APInt> AmtBits;
+      if (getTargetConstantBitsFromNode(Amt, /*EltSizeInBits=*/8, UndefElts,
+                                        AmtBits, /*AllowWholeUndefs=*/true,
+                                        /*AllowPartialUndefs=*/false)) {
+        // This optimized lowering is only valid if the elements in a pair can
+        // be treated identically.
+        bool SameShifts = true;
+        SmallVector<APInt> AmtBits16(NumElts / 2);
+        APInt UndefElts16 = APInt::getZero(AmtBits16.size());
+        for (unsigned SrcI = 0, E = AmtBits.size(); SrcI != E; SrcI += 2) {
+          unsigned DstI = SrcI / 2;
+          // Both elements are undef? Make a note and keep going.
+          if (UndefElts[SrcI] && UndefElts[SrcI + 1]) {
+            AmtBits16[DstI] = APInt::getZero(16);
+            UndefElts16.setBit(DstI);
+            continue;
+          }
+          // Even element is undef? We will shift it by the same amount as
+          // the odd element.
+          if (UndefElts[SrcI]) {
+            AmtBits16[DstI] = AmtBits[SrcI + 1].zext(16);
+            continue;
+          }
+          // Odd element is undef? We will shift it by the same amount as
+          // the even element.
+          if (UndefElts[SrcI + 1]) {
+            AmtBits16[DstI] = AmtBits[SrcI].zext(16);
+            continue;
+          }
+          // Both elements are equal.
+          if (AmtBits[SrcI] == AmtBits[SrcI + 1]) {
+            AmtBits16[DstI] = AmtBits[SrcI].zext(16);
+            continue;
+          }
+          // One of the provisional i16 elements will not have the same shift
+          // amount. Bail out.
+          SameShifts = false;
+          break;
+        }
+
+        // Every pair shares one shift amount and the operation is a logical
+        // shift, so the vXi16 SWAR lowering below is valid.
+        if (SameShifts) {
+          // Cast the operand to vXi16.
+          SDValue R16 = DAG.getBitcast(VT16, R);
+          // Create our new vector of shift amounts.
+          SDValue Amt16 = getConstVector(AmtBits16, UndefElts16, VT16, DAG, dl);
+          // Perform the actual shift.
+          SDValue ShiftedR = DAG.getNode(Opc, dl, VT16, R16, Amt16);
+          // Now we need to construct a mask which will "drop" bits that get
+          // shifted past the LSB/MSB. For a logical shift left, it will look
+          // like:
+          //   MaskLowBits = (0xff << Amt16) & 0xff;
+          //   MaskHighBits = MaskLowBits << 8;
+          //   Mask = MaskLowBits | MaskHighBits;
+          //
+          // This masking ensures that bits cannot migrate from one i8 to
+          // another. The construction of this mask will be constant folded.
+          // The mask for a logical right shift is nearly identical, the only
+          // difference is that 0xff is shifted right instead of left.
+          SDValue Cst255 = DAG.getConstant(0xff, dl, MVT::i16);
+          SDValue Splat255 = DAG.getSplat(VT16, dl, Cst255);
+          // The mask for the low bits is most simply expressed as an 8-bit
+          // field of all ones that is shifted in exactly the same way as the
+          // data and then masked with 0xff.
+          SDValue MaskLowBits = DAG.getNode(Opc, dl, VT16, Splat255, Amt16);
+          MaskLowBits = DAG.getNode(ISD::AND, dl, VT16, MaskLowBits, Splat255);
+          SDValue Cst8 = DAG.getConstant(8, dl, MVT::i16);
+          SDValue Splat8 = DAG.getSplat(VT16, dl, Cst8);
+          // The mask for the high bits is the same as the mask for the low
+          // bits but shifted up by 8.
+          SDValue MaskHighBits = DAG.getNode(ISD::SHL, dl, VT16, MaskLowBits, Splat8);
+          SDValue Mask = DAG.getNode(ISD::OR, dl, VT16, MaskLowBits, MaskHighBits);
+          // Finally, we mask the shifted vector with the SWAR mask.
+          SDValue Masked = DAG.getNode(ISD::AND, dl, VT16, ShiftedR, Mask);
+          return DAG.getBitcast(VT, Masked);
+        }
+      }
+    }
     SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
-    // Extend constant shift amount to vXi16 (it doesn't matter if the type
-    // isn't legal).
+    // Extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI (it
+    // doesn't matter if the type isn't legal).
     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
     Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
     Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
@@ -29885,7 +29971,6 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       }
     }
 
-    MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
     SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
     SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
 
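For reference, the SWAR scheme described in the comments above can be sketched in
scalar form. The snippet below is an illustration only, not part of the patch: it
packs two u8 lanes into a u16, performs the logical right shift on the whole u16,
and applies the per-byte mask that the DAG nodes above build (and that later gets
constant folded). It also checks the multiply-high identity that the pmulhuw-based
code in the tests below relies on.

  // Standalone sketch of the SWAR pair shift; illustration only, not part of
  // the patch.
  #include <cassert>
  #include <cstdint>

  // Logical right shift of two u8 lanes packed into one u16, both lanes
  // shifted by the same amount (1..7): one u16 shift plus a per-byte mask.
  static uint16_t swarSrlPair(uint16_t Pair, unsigned Amt) {
    uint16_t Shifted = Pair >> Amt;
    // Keep only the bits that belong to each lane after the shift, i.e. drop
    // the bits that crossed from the high lane into the low lane.
    uint16_t MaskLowBits = (0xff >> Amt) & 0xff;
    uint16_t Mask = MaskLowBits | (MaskLowBits << 8);
    return Shifted & Mask;
  }

  int main() {
    uint8_t Lo = 0xb7, Hi = 0x5e;
    unsigned Amt = 3;
    uint16_t Pair = (uint16_t)((Hi << 8) | Lo);
    uint16_t Res = swarSrlPair(Pair, Amt);
    assert((uint8_t)(Res & 0xff) == (uint8_t)(Lo >> Amt));
    assert((uint8_t)(Res >> 8) == (uint8_t)(Hi >> Amt));
    // The same u16 right shift can be done as a multiply-high, which is what
    // the pmulhuw sequences in the tests below use:
    //   (x * 2^(16 - Amt)) >> 16 == x >> Amt   for Amt in 1..15.
    assert((uint16_t)(((uint32_t)Pair * (1u << (16 - Amt))) >> 16) ==
           (uint16_t)(Pair >> Amt));
    return 0;
  }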
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 62b95eedc9d4f1..43c6e4b0db16f2 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -1226,6 +1226,67 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
   ret <8 x i16> %shift
 }
 
+define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind {
+; SSE-LABEL: constant_shift_v16i8_pairs:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096]
+; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: constant_shift_v16i8_pairs:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096]
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; XOP-LABEL: constant_shift_v16i8_pairs:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v16i8_pairs:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v16i8_pairs:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [7,7,2,2,4,4,6,6,1,1,2,2,3,3,4,4]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i8_pairs:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i8_pairs:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; X86-SSE-LABEL: constant_shift_v16i8_pairs:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096]
+; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    retl
+  %shift = lshr <16 x i8> %a, <i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4>
+  ret <16 x i8> %shift
+}
+
 define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v16i8:
 ; SSE2:       # %bb.0:
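The constants in the CHECK lines above follow directly from the SWAR lowering: a
pair that is logically shifted right by k becomes one i16 lane that is
multiplied-high by 2^(16-k) (the pmulhuw operand) and then masked with (0xff >> k)
replicated into both bytes (the pand operand). A small sketch that reproduces the
expected values; illustrative only, not generated by the test suite:

  #include <cstdint>
  #include <cstdio>

  int main() {
    // Per-pair shift amounts used by @constant_shift_v16i8_pairs above.
    const unsigned Amts[8] = {7, 2, 4, 6, 1, 2, 3, 4};
    for (unsigned K : Amts) {
      uint16_t Multiplier = (uint16_t)(1u << (16 - K));   // pmulhuw constant
      uint16_t Mask = (uint16_t)((0xffu >> K) * 0x0101u); // pand constant
      printf("%5u %5u\n", Multiplier, Mask);
    }
    // Prints 512/257, 16384/16191, 4096/3855, 1024/771, 32768/32639,
    // 16384/16191, 8192/7967, 4096/3855 -- matching the CHECK constants.
    return 0;
  }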
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index 0ef5d650535d23..932f210e239932 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -1345,6 +1345,72 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
   ret <16 x i16> %shift
 }
 
+define <32 x i8> @constant_shift_v32i8_pairs(<32 x i8> %a) nounwind {
+; AVX1-LABEL: constant_shift_v32i8_pairs:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [512,16384,4096,1024,32768,16384,8192,4096]
+; AVX1-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [257,16191,3855,771,32639,16191,7967,3855]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmulhuw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_shift_v32i8_pairs:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096]
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: constant_shift_v32i8_pairs:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [249,249,254,254,252,252,250,250,255,255,254,254,253,253,252,252]
+; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: constant_shift_v32i8_pairs:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [249,249,254,254,252,252,250,250,255,255,254,254,253,253,252,252]
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512DQ-LABEL: constant_shift_v32i8_pairs:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096]
+; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v32i8_pairs:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v32i8_pairs:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096]
+; AVX512DQVL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v32i8_pairs:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT:    retq
+  %shift = lshr <32 x i8> %a, <i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4>
+  ret <32 x i8> %shift
+}
+
 define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX1-LABEL: constant_shift_v32i8:
 ; AVX1:       # %bb.0:
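On XOP targets this path is not taken (the lowering above requires
!Subtarget.hasXOP()); the tests instead select vpshlb, whose per-byte shift count
is signed and shifts right when negative. The bytes [249,249,254,254,...] in the
XOP CHECK lines above are simply the two's-complement encodings of the negated
shift amounts, as this small check (illustrative only) shows:

  #include <cassert>
  #include <cstdint>

  int main() {
    // vpshlb constant from the XOP CHECK lines above, and the i8 shift
    // amounts from the corresponding IR.
    const uint8_t Enc[16] = {249, 249, 254, 254, 252, 252, 250, 250,
                             255, 255, 254, 254, 253, 253, 252, 252};
    const int Amt[16] = {7, 7, 2, 2, 4, 4, 6, 6, 1, 1, 2, 2, 3, 3, 4, 4};
    for (int I = 0; I != 16; ++I)
      assert((int8_t)Enc[I] == -Amt[I]); // negative count: logical right shift
    return 0;
  }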
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
index efd73b4ca132bb..8b61540081a7c7 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -306,6 +306,29 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
   ret <32 x i16> %shift
 }
 
+define <64 x i8> @constant_shift_v64i8_pairs(<64 x i8> %a) nounwind {
+; AVX512DQ-LABEL: constant_shift_v64i8_pairs:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096]
+; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [257,16191,3855,771,32639,16191,7967,3855,257,16191,3855,771,32639,16191,7967,3855,257,16191,3855,771,32639,16191,7967,3855,257,16191,3855,771,32639,16191,7967,3855]
+; AVX512DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: constant_shift_v64i8_pairs:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+  %shift = lshr <64 x i8> %a, <i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4>
+  ret <64 x i8> %shift
+}
+
 define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512DQ-LABEL: constant_shift_v64i8:
 ; AVX512DQ:       # %bb.0: