diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 5254fab1cdc914..791006c48b5ddf 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -666,10 +666,9 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index)))))) return false; + // TODO: We could handle this with a length-changing shuffle. auto *VecTy = cast(I.getType()); - auto *ScalarTy = VecTy->getScalarType(); - auto *SrcVecTy = dyn_cast(SrcVec->getType()); - if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType()) + if (SrcVec->getType() != VecTy) return false; // Ignore bogus insert/extract index. @@ -683,6 +682,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { SmallVector Mask(NumElts); std::iota(Mask.begin(), Mask.end(), 0); Mask[Index] = Index + NumElts; + + Type *ScalarTy = VecTy->getScalarType(); InstructionCost OldCost = TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) + TTI.getVectorInstrCost(I, VecTy, CostKind, Index); @@ -697,33 +698,14 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) + TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind); - bool NeedLenChg = SrcVecTy->getNumElements() != NumElts; - // If the lengths of the two vectors are not equal, - // we need to add a length-change vector. Add this cost. - SmallVector SrcMask; - if (NeedLenChg) { - SrcMask.assign(NumElts, PoisonMaskElem); - SrcMask[Index] = Index; - NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - SrcVecTy, SrcMask, CostKind); - } - if (NewCost > OldCost) return false; - Value *NewShuf; - // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index + // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index --> + // shuffle DestVec, (fneg SrcVec), Mask Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg); - if (NeedLenChg) { - // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask - Value *LenChgShuf = Builder.CreateShuffleVector(SrcVec, SrcMask); - NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask); - } else { - // shuffle DestVec, (fneg SrcVec), Mask - NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); - } - - replaceValue(I, *NewShuf); + Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); + replaceValue(I, *Shuf); return true; } diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll index 83f94ba46a072f..df5fcdb7beb656 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll @@ -18,19 +18,6 @@ define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) { ret <4 x float> %r } -define <4 x float> @ext0_v2f32v4f32(<2 x float> %x, <4 x float> %y) { -; CHECK-LABEL: @ext0_v2f32v4f32( -; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0 -; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 0 -; CHECK-NEXT: ret <4 x float> [[R]] -; - %e = extractelement <2 x float> %x, i32 0 - %n = fneg float %e - %r = insertelement <4 x float> %y, float %n, i32 0 - ret <4 x float> %r -} - ; Eliminating extract/insert is profitable. define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) { @@ -45,19 +32,6 @@ define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) { ret <4 x float> %r } -define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) { -; CHECK-LABEL: @ext2_v2f32v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = fneg <2 x float> [[X:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[R]] -; - %e = extractelement <2 x float> %x, i32 2 - %n = fneg float %e - %r = insertelement <4 x float> %y, float %n, i32 2 - ret <4 x float> %r -} - ; Eliminating extract/insert is still profitable. Flags propagate. define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) { @@ -72,25 +46,6 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) { ret <2 x double> %r } -define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) { -; SSE-LABEL: @ext1_v2f64v4f64( -; SSE-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 -; SSE-NEXT: [[N:%.*]] = fneg nsz double [[E]] -; SSE-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1 -; SSE-NEXT: ret <4 x double> [[R]] -; -; AVX-LABEL: @ext1_v2f64v4f64( -; AVX-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <4 x i32> -; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[R]] -; - %e = extractelement <2 x double> %x, i32 1 - %n = fneg nsz double %e - %r = insertelement <4 x double> %y, double %n, i32 1 - ret <4 x double> %r -} - ; The vector fneg would cost twice as much as the scalar op with SSE, ; so we don't transform there (the shuffle would also be more expensive). @@ -112,19 +67,6 @@ define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) { ret <8 x float> %r } -define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) { -; CHECK-LABEL: @ext7_v4f32v8f32( -; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 -; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7 -; CHECK-NEXT: ret <8 x float> [[R]] -; - %e = extractelement <4 x float> %x, i32 3 - %n = fneg float %e - %r = insertelement <8 x float> %y, float %n, i32 7 - ret <8 x float> %r -} - ; Same as above with an extra use of the extracted element. define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) { @@ -149,21 +91,6 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) { ret <8 x float> %r } -define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) { -; CHECK-LABEL: @ext7_v4f32v8f32_use1( -; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 -; CHECK-NEXT: call void @use(float [[E]]) -; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3 -; CHECK-NEXT: ret <8 x float> [[R]] -; - %e = extractelement <4 x float> %x, i32 3 - call void @use(float %e) - %n = fneg float %e - %r = insertelement <8 x float> %y, float %n, i32 3 - ret <8 x float> %r -} - ; Negative test - the transform is likely not profitable if the fneg has another use. define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) { @@ -181,21 +108,6 @@ define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) { ret <8 x float> %r } -define <8 x float> @ext7_v4f32v8f32_use2(<4 x float> %x, <8 x float> %y) { -; CHECK-LABEL: @ext7_v4f32v8f32_use2( -; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 -; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] -; CHECK-NEXT: call void @use(float [[N]]) -; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3 -; CHECK-NEXT: ret <8 x float> [[R]] -; - %e = extractelement <4 x float> %x, i32 3 - %n = fneg float %e - call void @use(float %n) - %r = insertelement <8 x float> %y, float %n, i32 3 - ret <8 x float> %r -} - ; Negative test - can't convert variable index to a shuffle. define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %index) { @@ -211,19 +123,6 @@ define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 % ret <2 x double> %r } -define <4 x double> @ext_index_var_v2f64v4f64(<2 x double> %x, <4 x double> %y, i32 %index) { -; CHECK-LABEL: @ext_index_var_v2f64v4f64( -; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]] -; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 [[INDEX]] -; CHECK-NEXT: ret <4 x double> [[R]] -; - %e = extractelement <2 x double> %x, i32 %index - %n = fneg nsz double %e - %r = insertelement <4 x double> %y, double %n, i32 %index - ret <4 x double> %r -} - ; Negative test - require same extract/insert index for simple shuffle. ; TODO: We could handle this by adjusting the cost calculation. @@ -240,33 +139,6 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) { ret <2 x double> %r } -; Negative test - extract from an index greater than the vector width of the destination -define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) { -; CHECK-LABEL: @ext3_v4f64v2f64( -; CHECK-NEXT: [[E:%.*]] = extractelement <4 x double> [[X:%.*]], i32 3 -; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1 -; CHECK-NEXT: ret <2 x double> [[R]] -; - %e = extractelement <4 x double> %x, i32 3 - %n = fneg nsz double %e - %r = insertelement <2 x double> %y, double %n, i32 1 - ret <2 x double> %r -} - -define <4 x double> @ext1_v2f64v4f64_ins0(<2 x double> %x, <4 x double> %y) { -; CHECK-LABEL: @ext1_v2f64v4f64_ins0( -; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 -; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0 -; CHECK-NEXT: ret <4 x double> [[R]] -; - %e = extractelement <2 x double> %x, i32 1 - %n = fneg nsz double %e - %r = insertelement <4 x double> %y, double %n, i32 0 - ret <4 x double> %r -} - ; Negative test - avoid changing poison ops define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) { @@ -282,19 +154,6 @@ define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) { ret <4 x float> %r } -define <4 x float> @ext12_v2f32v4f32(<2 x float> %x, <4 x float> %y) { -; CHECK-LABEL: @ext12_v2f32v4f32( -; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 6 -; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 12 -; CHECK-NEXT: ret <4 x float> [[R]] -; - %e = extractelement <2 x float> %x, i32 6 - %n = fneg float %e - %r = insertelement <4 x float> %y, float %n, i32 12 - ret <4 x float> %r -} - ; This used to crash because we assumed matching a true, unary fneg instruction. define <2 x float> @ext1_v2f32_fsub(<2 x float> %x) { @@ -322,16 +181,3 @@ define <2 x float> @ext1_v2f32_fsub_fmf(<2 x float> %x, <2 x float> %y) { %r = insertelement <2 x float> %y, float %s, i32 1 ret <2 x float> %r } - -define <4 x float> @ext1_v2f32v4f32_fsub_fmf(<2 x float> %x, <4 x float> %y) { -; CHECK-LABEL: @ext1_v2f32v4f32_fsub_fmf( -; CHECK-NEXT: [[TMP1:%.*]] = fneg nnan nsz <2 x float> [[X:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[R]] -; - %e = extractelement <2 x float> %x, i32 1 - %s = fsub nsz nnan float 0.0, %e - %r = insertelement <4 x float> %y, float %s, i32 1 - ret <4 x float> %r -}